From e89f639fa99a05607eaaed1832adc97b5095553b Mon Sep 17 00:00:00 2001 From: Robert Perrotta <104582251+robert-perrotta@users.noreply.github.com> Date: Tue, 4 Oct 2022 04:08:37 -0400 Subject: [PATCH 001/624] Fix error message typo (#6682) --- torchvision/models/detection/roi_heads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/roi_heads.py b/torchvision/models/detection/roi_heads.py index 18a6782a0..38dd7d4cf 100644 --- a/torchvision/models/detection/roi_heads.py +++ b/torchvision/models/detection/roi_heads.py @@ -787,7 +787,7 @@ class RoIHeads(nn.Module): mask_proposals = [p["boxes"] for p in result] if self.training: if matched_idxs is None: - raise ValueError("if in trainning, matched_idxs should not be None") + raise ValueError("if in training, matched_idxs should not be None") # during training, only focus on positive boxes num_images = len(proposals) -- GitLab From 344ccc05b2ebf643af319b6f648128e18807c892 Mon Sep 17 00:00:00 2001 From: Robert Perrotta <104582251+robert-perrotta@users.noreply.github.com> Date: Tue, 4 Oct 2022 04:12:07 -0400 Subject: [PATCH 002/624] Fix missing f-string prefix in error message (#6684) Co-authored-by: Nicolas Hug --- torchvision/models/detection/roi_heads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/roi_heads.py b/torchvision/models/detection/roi_heads.py index 38dd7d4cf..f6347a0d9 100644 --- a/torchvision/models/detection/roi_heads.py +++ b/torchvision/models/detection/roi_heads.py @@ -746,7 +746,7 @@ class RoIHeads(nn.Module): if not t["boxes"].dtype in floating_point_types: raise TypeError(f"target boxes must of float type, instead got {t['boxes'].dtype}") if not t["labels"].dtype == torch.int64: - raise TypeError("target labels must of int64 type, instead got {t['labels'].dtype}") + raise TypeError(f"target labels must of int64 type, instead got {t['labels'].dtype}") if self.has_keypoint(): if not t["keypoints"].dtype == torch.float32: raise TypeError(f"target keypoints must of float type, instead got {t['keypoints'].dtype}") -- GitLab From 45f87fa3a6bb2aec9885c3a0963513ea2bba7532 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Tue, 4 Oct 2022 03:16:57 -0700 Subject: [PATCH 003/624] [ONNX] Support exporting RoiAlign align=True to ONNX with opset 16 (#6685) * Support exporting RoiAlign align=True to ONNX with opset 16 * lint: ufmt Co-authored-by: Vasilis Vryniotis --- test/test_onnx.py | 27 +++++++----- torchvision/ops/_register_onnx_ops.py | 63 +++++++++++++++++++-------- 2 files changed, 61 insertions(+), 29 deletions(-) diff --git a/test/test_onnx.py b/test/test_onnx.py index d5dae64b4..b6f5481ed 100644 --- a/test/test_onnx.py +++ b/test/test_onnx.py @@ -1,6 +1,6 @@ import io from collections import OrderedDict -from typing import List, Tuple +from typing import List, Optional, Tuple import pytest import torch @@ -11,7 +11,7 @@ from torchvision.models.detection.image_list import ImageList from torchvision.models.detection.roi_heads import RoIHeads from torchvision.models.detection.rpn import AnchorGenerator, RegionProposalNetwork, RPNHead from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.ops._register_onnx_ops import _onnx_opset_version +from torchvision.ops import _register_onnx_ops # In environments without onnxruntime we prefer to # invoke all tests in the repo and have this one skipped rather than fail. 
@@ -32,7 +32,11 @@ class TestONNXExporter: dynamic_axes=None, output_names=None, input_names=None, + opset_version: Optional[int] = None, ): + if opset_version is None: + opset_version = _register_onnx_ops.base_onnx_opset_version + model.eval() onnx_io = io.BytesIO() @@ -46,10 +50,11 @@ class TestONNXExporter: torch_onnx_input, onnx_io, do_constant_folding=do_constant_folding, - opset_version=_onnx_opset_version, + opset_version=opset_version, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, + verbose=True, ) # validate the exported model with onnx runtime for test_inputs in inputs_list: @@ -140,39 +145,39 @@ class TestONNXExporter: model = ops.RoIAlign((5, 5), 1, -1) self.run_model(model, [(x, single_roi)]) - @pytest.mark.skip(reason="ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16.") def test_roi_align_aligned(self): + supported_onnx_version = _register_onnx_ops._onnx_opset_version_16 x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 0.5, 3, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1.8, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, 0, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, -1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) - @pytest.mark.skip(reason="Issue in exporting ROIAlign with aligned = True for malformed boxes") def test_roi_align_malformed_boxes(self): + supported_onnx_version = _register_onnx_ops._onnx_opset_version_16 x = torch.randn(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 2, 0.3, 1.5, 1.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) def test_roi_pool(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) diff --git a/torchvision/ops/_register_onnx_ops.py b/torchvision/ops/_register_onnx_ops.py index 629c19c16..eaea0b900 100644 --- a/torchvision/ops/_register_onnx_ops.py +++ b/torchvision/ops/_register_onnx_ops.py @@ -3,7 +3,9 @@ import warnings import torch -_onnx_opset_version = 11 +_onnx_opset_version_11 = 11 +_onnx_opset_version_16 = 16 +base_onnx_opset_version = _onnx_opset_version_11 def _register_custom_op(): @@ -20,32 +22,56 @@ def _register_custom_op(): nms_out = g.op("NonMaxSuppression", boxes, 
scores, max_output_per_class, iou_threshold) return squeeze(g, select(g, nms_out, 1, g.op("Constant", value_t=torch.tensor([2], dtype=torch.long))), 1) - @parse_args("v", "v", "f", "i", "i", "i", "i") - def roi_align(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): - batch_indices = _cast_Long( + def _process_batch_indices_for_roi_align(g, rois): + return _cast_Long( g, squeeze(g, select(g, rois, 1, g.op("Constant", value_t=torch.tensor([0], dtype=torch.long))), 1), False ) - rois = select(g, rois, 1, g.op("Constant", value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long))) - # TODO: Remove this warning after ONNX opset 16 is supported. - if aligned: - warnings.warn( - "ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16. " - "The workaround is that the user need apply the patch " - "https://github.com/microsoft/onnxruntime/pull/8564 " - "and build ONNXRuntime from source." - ) - # ONNX doesn't support negative sampling_ratio + def _process_rois_for_roi_align(g, rois): + return select(g, rois, 1, g.op("Constant", value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long))) + + def _process_sampling_ratio_for_roi_align(g, sampling_ratio: int): if sampling_ratio < 0: warnings.warn( - "ONNX doesn't support negative sampling ratio, therefore is set to 0 in order to be exported." + "ONNX export for RoIAlign with a non-zero sampling_ratio is not supported. " + "The model will be exported with a sampling_ratio of 0." ) sampling_ratio = 0 + return sampling_ratio + + @parse_args("v", "v", "f", "i", "i", "i", "i") + def roi_align_opset11(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + batch_indices = _process_batch_indices_for_roi_align(g, rois) + rois = _process_rois_for_roi_align(g, rois) + if aligned: + warnings.warn( + "ROIAlign with aligned=True is not supported in ONNX, but is supported in opset 16. " + "Please export with opset 16 or higher to use aligned=False." 
+ ) + sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio) + return g.op( + "RoiAlign", + input, + rois, + batch_indices, + spatial_scale_f=spatial_scale, + output_height_i=pooled_height, + output_width_i=pooled_width, + sampling_ratio_i=sampling_ratio, + ) + + @parse_args("v", "v", "f", "i", "i", "i", "i") + def roi_align_opset16(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + batch_indices = _process_batch_indices_for_roi_align(g, rois) + rois = _process_rois_for_roi_align(g, rois) + coordinate_transformation_mode = "half_pixel" if aligned else "output_half_pixel" + sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio) return g.op( "RoiAlign", input, rois, batch_indices, + coordinate_transformation_mode_s=coordinate_transformation_mode, spatial_scale_f=spatial_scale, output_height_i=pooled_height, output_width_i=pooled_width, @@ -61,6 +87,7 @@ def _register_custom_op(): from torch.onnx import register_custom_op_symbolic - register_custom_op_symbolic("torchvision::nms", symbolic_multi_label_nms, _onnx_opset_version) - register_custom_op_symbolic("torchvision::roi_align", roi_align, _onnx_opset_version) - register_custom_op_symbolic("torchvision::roi_pool", roi_pool, _onnx_opset_version) + register_custom_op_symbolic("torchvision::nms", symbolic_multi_label_nms, _onnx_opset_version_11) + register_custom_op_symbolic("torchvision::roi_align", roi_align_opset11, _onnx_opset_version_11) + register_custom_op_symbolic("torchvision::roi_align", roi_align_opset16, _onnx_opset_version_16) + register_custom_op_symbolic("torchvision::roi_pool", roi_pool, _onnx_opset_version_11) -- GitLab From 969a7b532ec8946efc78513e12a81dcc78289444 Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Tue, 4 Oct 2022 12:22:27 +0100 Subject: [PATCH 004/624] Bump main version to 0.15 (#6691) * Bump main version to 0.15 * Update table on README.rst * Revert the readme update --- android/gradle.properties | 2 +- version.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/android/gradle.properties b/android/gradle.properties index 1b6b275f6..8204b73b0 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.14.0-SNAPSHOT +VERSION_NAME=0.15.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch SONATYPE_STAGING_PROFILE=orgpytorch diff --git a/version.txt b/version.txt index 56f78043a..b4f7ccce2 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.0a0 +0.15.0a0 -- GitLab From b482d896f448cc44fdadb030391ac12723a81546 Mon Sep 17 00:00:00 2001 From: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com> Date: Tue, 4 Oct 2022 04:23:05 -0700 Subject: [PATCH 005/624] Use real image in test_detection_model (#6658) * update test_models.py * update tests * fix linting * fix linting * add comment * Trigger CI Co-authored-by: YosuaMichael --- ...er.test_fasterrcnn_resnet50_fpn_expect.pkl | Bin 3939 -> 4395 bytes ...test_fasterrcnn_resnet50_fpn_v2_expect.pkl | Bin 3939 -> 4410 bytes ...elTester.test_fcos_resnet50_fpn_expect.pkl | Bin 9571 -> 3405 bytes ....test_keypointrcnn_resnet50_fpn_expect.pkl | Bin 2199 -> 3367 bytes ...ster.test_maskrcnn_resnet50_fpn_expect.pkl | Bin 4507 -> 4965 bytes ...r.test_maskrcnn_resnet50_fpn_v2_expect.pkl | Bin 4507 -> 4986 bytes ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 9571 -> 9382 bytes ....test_retinanet_resnet50_fpn_v2_expect.pkl | Bin 9571 -> 9461 bytes test/test_models.py | 55 ++++++++++++++++-- 9 
files changed, 49 insertions(+), 6 deletions(-)

 [GIT binary patch data for the nine updated test/expect/ModelTester.*_expect.pkl files and the accompanying test/test_models.py hunk omitted]

-- GitLab

Date: Tue, 4 Oct
2022 17:48:11 +0100 Subject: [PATCH 006/624] fix bug in output format for pyav (#6672) * fix bug in output format for pyav * add read from memory with constructor overload * Revert "add read from memory with constructor overload" This reverts commit 14cbbab239165be05096fd6cbb88cb0448502436. * run ufmt --- torchvision/io/video.py | 132 ++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 65 deletions(-) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index ceb20fe52..002fde998 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -273,72 +273,74 @@ def read_video( raise RuntimeError(f"File not found: {filename}") if get_video_backend() != "pyav": - return _video_opt._read_video(filename, start_pts, end_pts, pts_unit) - - _check_av_available() - - if end_pts is None: - end_pts = float("inf") - - if end_pts < start_pts: - raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}") - - info = {} - video_frames = [] - audio_frames = [] - audio_timebase = _video_opt.default_timebase - - try: - with av.open(filename, metadata_errors="ignore") as container: - if container.streams.audio: - audio_timebase = container.streams.audio[0].time_base - if container.streams.video: - video_frames = _read_from_stream( - container, - start_pts, - end_pts, - pts_unit, - container.streams.video[0], - {"video": 0}, - ) - video_fps = container.streams.video[0].average_rate - # guard against potentially corrupted files - if video_fps is not None: - info["video_fps"] = float(video_fps) - - if container.streams.audio: - audio_frames = _read_from_stream( - container, - start_pts, - end_pts, - pts_unit, - container.streams.audio[0], - {"audio": 0}, - ) - info["audio_fps"] = container.streams.audio[0].rate - - except av.AVError: - # TODO raise a warning? 
- pass - - vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames] - aframes_list = [frame.to_ndarray() for frame in audio_frames] - - if vframes_list: - vframes = torch.as_tensor(np.stack(vframes_list)) - else: - vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8) - - if aframes_list: - aframes = np.concatenate(aframes_list, 1) - aframes = torch.as_tensor(aframes) - if pts_unit == "sec": - start_pts = int(math.floor(start_pts * (1 / audio_timebase))) - if end_pts != float("inf"): - end_pts = int(math.ceil(end_pts * (1 / audio_timebase))) - aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts) + vframes, aframes, info = _video_opt._read_video(filename, start_pts, end_pts, pts_unit) else: - aframes = torch.empty((1, 0), dtype=torch.float32) + _check_av_available() + + if end_pts is None: + end_pts = float("inf") + + if end_pts < start_pts: + raise ValueError( + f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}" + ) + + info = {} + video_frames = [] + audio_frames = [] + audio_timebase = _video_opt.default_timebase + + try: + with av.open(filename, metadata_errors="ignore") as container: + if container.streams.audio: + audio_timebase = container.streams.audio[0].time_base + if container.streams.video: + video_frames = _read_from_stream( + container, + start_pts, + end_pts, + pts_unit, + container.streams.video[0], + {"video": 0}, + ) + video_fps = container.streams.video[0].average_rate + # guard against potentially corrupted files + if video_fps is not None: + info["video_fps"] = float(video_fps) + + if container.streams.audio: + audio_frames = _read_from_stream( + container, + start_pts, + end_pts, + pts_unit, + container.streams.audio[0], + {"audio": 0}, + ) + info["audio_fps"] = container.streams.audio[0].rate + + except av.AVError: + # TODO raise a warning? 
+ pass + + vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames] + aframes_list = [frame.to_ndarray() for frame in audio_frames] + + if vframes_list: + vframes = torch.as_tensor(np.stack(vframes_list)) + else: + vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8) + + if aframes_list: + aframes = np.concatenate(aframes_list, 1) + aframes = torch.as_tensor(aframes) + if pts_unit == "sec": + start_pts = int(math.floor(start_pts * (1 / audio_timebase))) + if end_pts != float("inf"): + end_pts = int(math.ceil(end_pts * (1 / audio_timebase))) + aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts) + else: + aframes = torch.empty((1, 0), dtype=torch.float32) if output_format == "TCHW": # [T,H,W,C] --> [T,C,H,W] -- GitLab From 3038cb28d9e9c8c2cebda310943721ec0dc9014e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 4 Oct 2022 20:05:51 +0200 Subject: [PATCH 007/624] fix example galleries in documentation (#6694) * exclude sphinx-gallery==0.11.0 * fix CSS * Update docs/requirements.txt --- docs/requirements.txt | 2 +- docs/source/_static/css/custom_torchvision.css | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 1ff0c8280..09a11359a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ matplotlib numpy sphinx-copybutton>=0.3.1 -sphinx-gallery>=0.9.0 +sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme diff --git a/docs/source/_static/css/custom_torchvision.css b/docs/source/_static/css/custom_torchvision.css index bdc4071c1..07346d7b0 100644 --- a/docs/source/_static/css/custom_torchvision.css +++ b/docs/source/_static/css/custom_torchvision.css @@ -21,3 +21,15 @@ article.pytorch-article .reference.download.internal, article.pytorch-article .s .table-weights p { margin-bottom: 0.2rem !important; } + +/* Fix for Sphinx gallery 0.11 +See https://github.com/sphinx-gallery/sphinx-gallery/issues/990 +*/ +article.pytorch-article .sphx-glr-thumbnails .sphx-glr-thumbcontainer { + width: unset; + margin-right: 0; + margin-left: 0; +} +article.pytorch-article div.section div.wy-table-responsive tbody td { + width: 50%; +} -- GitLab From 71885b0f255f2e76fd0a07e348bddbec6430a8d3 Mon Sep 17 00:00:00 2001 From: Karan Desai Date: Tue, 4 Oct 2022 17:58:25 -0400 Subject: [PATCH 008/624] Make CUB200 labels 0-indexed. (#6702) CUB200 dataset in `torchvision.prototype.datasets` module formed labels using file paths. This resulted in labels being 1-indexed (1-200) instead of 0-indexed (0-199). Similar issue occurred with Flowers102 (`torchvision.datasets` module, #5766). 
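
Minimal sketch of the parsing involved (illustrative only, not the prototype datapipe itself; the helper and paths below are hypothetical): the class-folder prefix in CUB-200 is 1-based, so the added `- 1` maps it onto the 0-based label range used by the other datasets.

    import pathlib

    def label_from_path(path: str) -> int:
        # Class folders are named like "001.Black_footed_Albatross"; the numeric
        # prefix is 1-based, so subtract 1 to get a 0-based label in [0, 199].
        folder = pathlib.Path(path).parent.name
        return int(folder.rsplit(".", 1)[0]) - 1

    # Hypothetical image paths, shown only to illustrate the off-by-one fix:
    print(label_from_path("CUB_200_2011/images/001.Black_footed_Albatross/img_0001.jpg"))  # 0
    print(label_from_path("CUB_200_2011/images/200.Common_Yellowthroat/img_0042.jpg"))     # 199
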
--- torchvision/prototype/datasets/_builtin/cub200.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index c07166a96..f1531615c 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -177,7 +177,10 @@ class CUB200(Dataset): return dict( prepare_ann_fn(anns_data, image.image_size), image=image, - label=Label(int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]), categories=self._categories), + label=Label( + int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1, + categories=self._categories, + ), ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: -- GitLab From 0e006a9fc9eae6d3cc13fceb3a205af29b06d703 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Tue, 4 Oct 2022 15:02:57 -0700 Subject: [PATCH 009/624] [ONNX] Rephrase ONNX RoiAlign warning for aligned=True (#6704) * Rephrase ONNX RoiAlign warning for aligned=True * add comma Co-authored-by: Vasilis Vryniotis --- torchvision/ops/_register_onnx_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/ops/_register_onnx_ops.py b/torchvision/ops/_register_onnx_ops.py index eaea0b900..8f9598e1f 100644 --- a/torchvision/ops/_register_onnx_ops.py +++ b/torchvision/ops/_register_onnx_ops.py @@ -45,8 +45,8 @@ def _register_custom_op(): rois = _process_rois_for_roi_align(g, rois) if aligned: warnings.warn( - "ROIAlign with aligned=True is not supported in ONNX, but is supported in opset 16. " - "Please export with opset 16 or higher to use aligned=False." + "ROIAlign with aligned=True is only supported in opset >= 16. " + "Please export with opset 16 or higher, or use aligned=False." 
) sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio) return g.op( -- GitLab From 4a99bae8ad28247520b3e3b179ed604317d5fdce Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Oct 2022 14:07:55 +0200 Subject: [PATCH 010/624] add dispatch tests for prototype transform dispatchers (#6631) --- test/prototype_transforms_dispatcher_infos.py | 122 ++++++++++++++++-- test/prototype_transforms_kernel_infos.py | 2 +- test/test_prototype_transforms_functional.py | 75 +++++++++++ 3 files changed, 189 insertions(+), 10 deletions(-) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index 99a9066be..a14d5eaf0 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -1,15 +1,27 @@ +import collections.abc import dataclasses + from collections import defaultdict -from typing import Callable, Dict, List, Sequence, Type +from typing import Callable, Dict, List, Optional, Sequence, Type import pytest import torchvision.prototype.transforms.functional as F -from prototype_transforms_kernel_infos import KERNEL_INFOS, Skip +from prototype_common_utils import BoundingBoxLoader +from prototype_transforms_kernel_infos import KERNEL_INFOS, KernelInfo, Skip from torchvision.prototype import features __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] -KERNEL_SAMPLE_INPUTS_FN_MAP = {info.kernel: info.sample_inputs_fn for info in KERNEL_INFOS} +KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} + + +@dataclasses.dataclass +class PILKernelInfo: + kernel: Callable + kernel_name: str = dataclasses.field(default=None) + + def __post_init__(self): + self.kernel_name = self.kernel_name or self.kernel.__name__ def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): @@ -28,21 +40,35 @@ def skip_integer_size_jit(name="size"): class DispatcherInfo: dispatcher: Callable kernels: Dict[Type, Callable] + kernel_infos: Dict[Type, KernelInfo] = dataclasses.field(default=None) + pil_kernel_info: Optional[PILKernelInfo] = None + method_name: str = dataclasses.field(default=None) skips: Sequence[Skip] = dataclasses.field(default_factory=list) _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) def __post_init__(self): + self.kernel_infos = {feature_type: KERNEL_INFO_MAP[kernel] for feature_type, kernel in self.kernels.items()} + self.method_name = self.method_name or self.dispatcher.__name__ skips_map = defaultdict(list) for skip in self.skips: skips_map[skip.test_name].append(skip) self._skips_map = dict(skips_map) - def sample_inputs(self, *types): - for type in types or self.kernels.keys(): - if type not in self.kernels: - raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") + def sample_inputs(self, *feature_types, filter_metadata=True): + for feature_type in feature_types or self.kernels.keys(): + if feature_type not in self.kernels: + raise pytest.UsageError(f"There is no kernel registered for type {feature_type.__name__}") + + sample_inputs = self.kernel_infos[feature_type].sample_inputs_fn() + if not filter_metadata: + yield from sample_inputs + else: + for args_kwargs in sample_inputs: + for attribute in feature_type.__annotations__.keys(): + if attribute in args_kwargs.kwargs: + del args_kwargs.kwargs[attribute] - yield from KERNEL_SAMPLE_INPUTS_FN_MAP[self.kernels[type]]() + yield args_kwargs def maybe_skip(self, *, test_name, args_kwargs, device): skips = 
self._skips_map.get(test_name) @@ -54,6 +80,31 @@ class DispatcherInfo: pytest.skip(skip.reason) +def fill_sequence_needs_broadcast(args_kwargs, device): + (image_loader, *_), kwargs = args_kwargs + try: + fill = kwargs["fill"] + except KeyError: + return False + + if not isinstance(fill, collections.abc.Sequence) or len(fill) > 1: + return False + + return image_loader.num_channels > 1 + + +skip_dispatch_pil_if_fill_sequence_needs_broadcast = Skip( + "test_dispatch_pil", + condition=fill_sequence_needs_broadcast, + reason="PIL kernel doesn't support sequences of length 1 if the number of channels is larger.", +) + +skip_dispatch_feature = Skip( + "test_dispatch_feature", + reason="Dispatcher doesn't support arbitrary feature dispatch.", +) + + DISPATCHER_INFOS = [ DispatcherInfo( F.horizontal_flip, @@ -62,6 +113,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.horizontal_flip_bounding_box, features.Mask: F.horizontal_flip_mask, }, + pil_kernel_info=PILKernelInfo(F.horizontal_flip_image_pil, kernel_name="horizontal_flip_image_pil"), ), DispatcherInfo( F.resize, @@ -70,6 +122,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.resize_bounding_box, features.Mask: F.resize_mask, }, + pil_kernel_info=PILKernelInfo(F.resize_image_pil), skips=[ skip_integer_size_jit(), ], @@ -81,7 +134,11 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.affine_bounding_box, features.Mask: F.affine_mask, }, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], + pil_kernel_info=PILKernelInfo(F.affine_image_pil), + skips=[ + skip_dispatch_pil_if_fill_sequence_needs_broadcast, + skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), + ], ), DispatcherInfo( F.vertical_flip, @@ -90,6 +147,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.vertical_flip_bounding_box, features.Mask: F.vertical_flip_mask, }, + pil_kernel_info=PILKernelInfo(F.vertical_flip_image_pil, kernel_name="vertical_flip_image_pil"), ), DispatcherInfo( F.rotate, @@ -98,6 +156,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.rotate_bounding_box, features.Mask: F.rotate_mask, }, + pil_kernel_info=PILKernelInfo(F.rotate_image_pil), ), DispatcherInfo( F.crop, @@ -106,6 +165,17 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.crop_bounding_box, features.Mask: F.crop_mask, }, + pil_kernel_info=PILKernelInfo(F.crop_image_pil, kernel_name="crop_image_pil"), + skips=[ + Skip( + "test_dispatch_feature", + condition=lambda args_kwargs, device: isinstance(args_kwargs.args[0], BoundingBoxLoader), + reason=( + "F.crop expects 4 coordinates as input, but bounding box sample inputs only generate two " + "since that is sufficient for the kernel." 
+ ), + ) + ], ), DispatcherInfo( F.resized_crop, @@ -114,6 +184,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.resized_crop_bounding_box, features.Mask: F.resized_crop_mask, }, + pil_kernel_info=PILKernelInfo(F.resized_crop_image_pil), ), DispatcherInfo( F.pad, @@ -122,6 +193,10 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.pad_bounding_box, features.Mask: F.pad_mask, }, + skips=[ + skip_dispatch_pil_if_fill_sequence_needs_broadcast, + ], + pil_kernel_info=PILKernelInfo(F.pad_image_pil, kernel_name="pad_image_pil"), ), DispatcherInfo( F.perspective, @@ -130,6 +205,10 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.perspective_bounding_box, features.Mask: F.perspective_mask, }, + skips=[ + skip_dispatch_pil_if_fill_sequence_needs_broadcast, + ], + pil_kernel_info=PILKernelInfo(F.perspective_image_pil), ), DispatcherInfo( F.elastic, @@ -138,6 +217,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.elastic_bounding_box, features.Mask: F.elastic_mask, }, + pil_kernel_info=PILKernelInfo(F.elastic_image_pil), ), DispatcherInfo( F.center_crop, @@ -146,6 +226,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.center_crop_bounding_box, features.Mask: F.center_crop_mask, }, + pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), skips=[ skip_integer_size_jit("output_size"), ], @@ -155,6 +236,7 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.gaussian_blur_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.gaussian_blur_image_pil), skips=[ skip_python_scalar_arg_jit("kernel_size"), skip_python_scalar_arg_jit("sigma"), @@ -165,80 +247,97 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.equalize_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.equalize_image_pil, kernel_name="equalize_image_pil"), ), DispatcherInfo( F.invert, kernels={ features.Image: F.invert_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.invert_image_pil, kernel_name="invert_image_pil"), ), DispatcherInfo( F.posterize, kernels={ features.Image: F.posterize_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.posterize_image_pil, kernel_name="posterize_image_pil"), ), DispatcherInfo( F.solarize, kernels={ features.Image: F.solarize_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.solarize_image_pil, kernel_name="solarize_image_pil"), ), DispatcherInfo( F.autocontrast, kernels={ features.Image: F.autocontrast_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.autocontrast_image_pil, kernel_name="autocontrast_image_pil"), ), DispatcherInfo( F.adjust_sharpness, kernels={ features.Image: F.adjust_sharpness_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_sharpness_image_pil, kernel_name="adjust_sharpness_image_pil"), ), DispatcherInfo( F.erase, kernels={ features.Image: F.erase_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.erase_image_pil), + skips=[ + skip_dispatch_feature, + ], ), DispatcherInfo( F.adjust_brightness, kernels={ features.Image: F.adjust_brightness_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_brightness_image_pil, kernel_name="adjust_brightness_image_pil"), ), DispatcherInfo( F.adjust_contrast, kernels={ features.Image: F.adjust_contrast_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_contrast_image_pil, kernel_name="adjust_contrast_image_pil"), ), DispatcherInfo( F.adjust_gamma, kernels={ features.Image: F.adjust_gamma_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_gamma_image_pil, kernel_name="adjust_gamma_image_pil"), ), DispatcherInfo( F.adjust_hue, kernels={ features.Image: F.adjust_hue_image_tensor, }, + 
pil_kernel_info=PILKernelInfo(F.adjust_hue_image_pil, kernel_name="adjust_hue_image_pil"), ), DispatcherInfo( F.adjust_saturation, kernels={ features.Image: F.adjust_saturation_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_saturation_image_pil, kernel_name="adjust_saturation_image_pil"), ), DispatcherInfo( F.five_crop, kernels={ features.Image: F.five_crop_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.five_crop_image_pil), skips=[ skip_integer_size_jit(), + skip_dispatch_feature, ], ), DispatcherInfo( @@ -246,8 +345,10 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.ten_crop_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), skips=[ skip_integer_size_jit(), + skip_dispatch_feature, ], ), DispatcherInfo( @@ -255,5 +356,8 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.normalize_image_tensor, }, + skips=[ + skip_dispatch_feature, + ], ), ] diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 3f050ad8f..a047a2d57 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -33,7 +33,7 @@ class KernelInfo: sample_inputs_fn: Callable[[], Iterable[ArgsKwargs]] # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name: Optional[str] = None + kernel_name: str = dataclasses.field(default=None) # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also take # tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should happen # inside the function. It should return a tensor or to be more precise an object that can be compared to a diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index b2c830d5d..143a5cd22 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -174,6 +174,18 @@ class TestKernels: assert_close(actual, expected, check_dtype=False, **info.closeness_kwargs) +@pytest.fixture +def spy_on(mocker): + def make_spy(fn, *, module=None, name=None): + # TODO: we can probably get rid of the non-default modules and names if we eliminate aliasing + module = module or fn.__module__ + name = name or fn.__name__ + spy = mocker.patch(f"{module}.{name}", wraps=fn) + return spy + + return make_spy + + class TestDispatchers: @pytest.mark.parametrize( ("info", "args_kwargs"), @@ -211,6 +223,69 @@ class TestDispatchers: def test_scriptable(self, dispatcher): script(dispatcher) + @pytest.mark.parametrize( + ("info", "args_kwargs"), + [ + pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") + for info in DISPATCHER_INFOS + for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) + if features.Image in info.kernels + ], + ) + def test_dispatch_simple_tensor(self, info, args_kwargs, spy_on): + (image_feature, *other_args), kwargs = args_kwargs.load() + image_simple_tensor = torch.Tensor(image_feature) + + kernel_info = info.kernel_infos[features.Image] + spy = spy_on(kernel_info.kernel, module=info.dispatcher.__module__, name=kernel_info.kernel_name) + + info.dispatcher(image_simple_tensor, *other_args, **kwargs) + + spy.assert_called_once() + + @pytest.mark.parametrize( + ("info", "args_kwargs"), + [ + pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") + for info in 
DISPATCHER_INFOS + for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) + if features.Image in info.kernels and info.pil_kernel_info is not None + ], + ) + def test_dispatch_pil(self, info, args_kwargs, spy_on): + (image_feature, *other_args), kwargs = args_kwargs.load() + + if image_feature.ndim > 3: + pytest.skip("Input is batched") + + image_pil = F.to_image_pil(image_feature) + + pil_kernel_info = info.pil_kernel_info + spy = spy_on(pil_kernel_info.kernel, module=info.dispatcher.__module__, name=pil_kernel_info.kernel_name) + + info.dispatcher(image_pil, *other_args, **kwargs) + + spy.assert_called_once() + + @pytest.mark.parametrize( + ("info", "args_kwargs"), + [ + pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") + for info in DISPATCHER_INFOS + for idx, args_kwargs in enumerate(info.sample_inputs()) + ], + ) + def test_dispatch_feature(self, info, args_kwargs, spy_on): + (feature, *other_args), kwargs = args_kwargs.load() + + method = getattr(feature, info.method_name) + feature_type = type(feature) + spy = spy_on(method, module=feature_type.__module__, name=f"{feature_type.__name__}.{info.method_name}") + + info.dispatcher(feature, *other_args, **kwargs) + + spy.assert_called_once() + @pytest.mark.parametrize( ("alias", "target"), -- GitLab From a46c4f0ccdb67d94c2ffc8b68b52693533a7683c Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Wed, 5 Oct 2022 13:54:05 +0100 Subject: [PATCH 011/624] [bugfix] Fix the output format for VideoClips.subset (#6700) Co-authored-by: Vasilis Vryniotis --- torchvision/datasets/video_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index c4890ff44..b607def24 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -198,6 +198,7 @@ class VideoClips: _video_max_dimension=self._video_max_dimension, _audio_samples=self._audio_samples, _audio_channels=self._audio_channels, + output_format=self.output_format, ) @staticmethod -- GitLab From 46eae182b9a2ad3cb906294e51be2838c98b5073 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Oct 2022 16:46:30 +0200 Subject: [PATCH 012/624] use pytest markers instead of custom solution for prototype transforms functional tests (#6653) * use pytest markers instead of custom solution for prototype transforms functional tests * cleanup * cleanup * trigger CI --- test/prototype_transforms_dispatcher_infos.py | 139 +++++++++--------- test/prototype_transforms_kernel_infos.py | 132 ++++++++--------- test/test_prototype_transforms_functional.py | 131 +++++++++-------- 3 files changed, 201 insertions(+), 201 deletions(-) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index a14d5eaf0..11a4c35ae 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -2,12 +2,12 @@ import collections.abc import dataclasses from collections import defaultdict + from typing import Callable, Dict, List, Optional, Sequence, Type import pytest import torchvision.prototype.transforms.functional as F -from prototype_common_utils import BoundingBoxLoader -from prototype_transforms_kernel_infos import KERNEL_INFOS, KernelInfo, Skip +from prototype_transforms_kernel_infos import KERNEL_INFOS, TestMark from torchvision.prototype import features __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] @@ -24,35 +24,27 @@ class PILKernelInfo: self.kernel_name = self.kernel_name or 
self.kernel.__name__ -def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): - return Skip( - "test_scripted_smoke", - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs[name], (int, float)), - reason=reason, - ) - - -def skip_integer_size_jit(name="size"): - return skip_python_scalar_arg_jit(name, reason="Integer size is not supported when scripting.") - - @dataclasses.dataclass class DispatcherInfo: dispatcher: Callable kernels: Dict[Type, Callable] - kernel_infos: Dict[Type, KernelInfo] = dataclasses.field(default=None) pil_kernel_info: Optional[PILKernelInfo] = None method_name: str = dataclasses.field(default=None) - skips: Sequence[Skip] = dataclasses.field(default_factory=list) - _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) + test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) + _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) def __post_init__(self): self.kernel_infos = {feature_type: KERNEL_INFO_MAP[kernel] for feature_type, kernel in self.kernels.items()} self.method_name = self.method_name or self.dispatcher.__name__ - skips_map = defaultdict(list) - for skip in self.skips: - skips_map[skip.test_name].append(skip) - self._skips_map = dict(skips_map) + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] def sample_inputs(self, *feature_types, filter_metadata=True): for feature_type in feature_types or self.kernels.keys(): @@ -70,17 +62,27 @@ class DispatcherInfo: yield args_kwargs - def maybe_skip(self, *, test_name, args_kwargs, device): - skips = self._skips_map.get(test_name) - if not skips: - return - for skip in skips: - if skip.condition(args_kwargs, device): - pytest.skip(skip.reason) +def xfail_python_scalar_arg_jit(name, *, reason=None): + reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), + ) + +def xfail_integer_size_jit(name="size"): + return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") -def fill_sequence_needs_broadcast(args_kwargs, device): + +skip_dispatch_feature = TestMark( + ("TestDispatchers", "test_dispatch_feature"), + pytest.mark.skip(reason="Dispatcher doesn't support arbitrary feature dispatch."), +) + + +def fill_sequence_needs_broadcast(args_kwargs): (image_loader, *_), kwargs = args_kwargs try: fill = kwargs["fill"] @@ -93,15 +95,12 @@ def fill_sequence_needs_broadcast(args_kwargs, device): return image_loader.num_channels > 1 -skip_dispatch_pil_if_fill_sequence_needs_broadcast = Skip( - "test_dispatch_pil", +xfail_dispatch_pil_if_fill_sequence_needs_broadcast = TestMark( + ("TestDispatchers", "test_dispatch_pil"), + pytest.mark.xfail( + reason="PIL kernel doesn't support sequences of length 1 for `fill` if the number of color channels is larger." 
+ ), condition=fill_sequence_needs_broadcast, - reason="PIL kernel doesn't support sequences of length 1 if the number of channels is larger.", -) - -skip_dispatch_feature = Skip( - "test_dispatch_feature", - reason="Dispatcher doesn't support arbitrary feature dispatch.", ) @@ -123,8 +122,8 @@ DISPATCHER_INFOS = [ features.Mask: F.resize_mask, }, pil_kernel_info=PILKernelInfo(F.resize_image_pil), - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), DispatcherInfo( @@ -135,9 +134,9 @@ DISPATCHER_INFOS = [ features.Mask: F.affine_mask, }, pil_kernel_info=PILKernelInfo(F.affine_image_pil), - skips=[ - skip_dispatch_pil_if_fill_sequence_needs_broadcast, - skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), + test_marks=[ + xfail_dispatch_pil_if_fill_sequence_needs_broadcast, + xfail_python_scalar_arg_jit("shear"), ], ), DispatcherInfo( @@ -166,16 +165,6 @@ DISPATCHER_INFOS = [ features.Mask: F.crop_mask, }, pil_kernel_info=PILKernelInfo(F.crop_image_pil, kernel_name="crop_image_pil"), - skips=[ - Skip( - "test_dispatch_feature", - condition=lambda args_kwargs, device: isinstance(args_kwargs.args[0], BoundingBoxLoader), - reason=( - "F.crop expects 4 coordinates as input, but bounding box sample inputs only generate two " - "since that is sufficient for the kernel." - ), - ) - ], ), DispatcherInfo( F.resized_crop, @@ -193,10 +182,20 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.pad_bounding_box, features.Mask: F.pad_mask, }, - skips=[ - skip_dispatch_pil_if_fill_sequence_needs_broadcast, - ], pil_kernel_info=PILKernelInfo(F.pad_image_pil, kernel_name="pad_image_pil"), + test_marks=[ + TestMark( + ("TestDispatchers", "test_dispatch_pil"), + pytest.mark.xfail( + reason=( + "PIL kernel doesn't support sequences of length 1 for argument `fill` and " + "`padding_mode='constant'`, if the number of color channels is larger." 
+ ) + ), + condition=lambda args_kwargs: fill_sequence_needs_broadcast(args_kwargs) + and args_kwargs.kwargs.get("padding_mode", "constant") == "constant", + ) + ], ), DispatcherInfo( F.perspective, @@ -205,10 +204,10 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.perspective_bounding_box, features.Mask: F.perspective_mask, }, - skips=[ - skip_dispatch_pil_if_fill_sequence_needs_broadcast, - ], pil_kernel_info=PILKernelInfo(F.perspective_image_pil), + test_marks=[ + xfail_dispatch_pil_if_fill_sequence_needs_broadcast, + ], ), DispatcherInfo( F.elastic, @@ -227,8 +226,8 @@ DISPATCHER_INFOS = [ features.Mask: F.center_crop_mask, }, pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), DispatcherInfo( @@ -237,9 +236,9 @@ DISPATCHER_INFOS = [ features.Image: F.gaussian_blur_image_tensor, }, pil_kernel_info=PILKernelInfo(F.gaussian_blur_image_pil), - skips=[ - skip_python_scalar_arg_jit("kernel_size"), - skip_python_scalar_arg_jit("sigma"), + test_marks=[ + xfail_python_scalar_arg_jit("kernel_size"), + xfail_python_scalar_arg_jit("sigma"), ], ), DispatcherInfo( @@ -290,7 +289,7 @@ DISPATCHER_INFOS = [ features.Image: F.erase_image_tensor, }, pil_kernel_info=PILKernelInfo(F.erase_image_pil), - skips=[ + test_marks=[ skip_dispatch_feature, ], ), @@ -335,8 +334,8 @@ DISPATCHER_INFOS = [ features.Image: F.five_crop_image_tensor, }, pil_kernel_info=PILKernelInfo(F.five_crop_image_pil), - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), skip_dispatch_feature, ], ), @@ -345,18 +344,18 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.ten_crop_image_tensor, }, - pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), skip_dispatch_feature, ], + pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), ), DispatcherInfo( F.normalize, kernels={ features.Image: F.normalize_image_tensor, }, - skips=[ + test_marks=[ skip_dispatch_feature, ], ), diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index a047a2d57..2e02989b4 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -3,13 +3,15 @@ import functools import itertools import math from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple import numpy as np import pytest import torch.testing import torchvision.ops import torchvision.prototype.transforms.functional as F + +from _pytest.mark.structures import MarkDecorator from datasets_utils import combinations_grid from prototype_common_utils import ArgsKwargs, make_bounding_box_loaders, make_image_loaders, make_mask_loaders from torchvision.prototype import features @@ -18,11 +20,14 @@ from torchvision.transforms.functional_tensor import _max_value as get_max_value __all__ = ["KernelInfo", "KERNEL_INFOS"] +TestID = Tuple[Optional[str], str] + + @dataclasses.dataclass -class Skip: - test_name: str - reason: str - condition: Callable[[ArgsKwargs, str], bool] = lambda args_kwargs, device: True +class TestMark: + test_id: TestID + mark: MarkDecorator + condition: Callable[[ArgsKwargs], bool] = lambda args_kwargs: True @dataclasses.dataclass @@ -44,26 +49,22 @@ class KernelInfo: reference_inputs_fn: Optional[Callable[[], Iterable[ArgsKwargs]]] = None # Additional 
parameters, e.g. `rtol=1e-3`, passed to `assert_close`. closeness_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) - skips: Sequence[Skip] = dataclasses.field(default_factory=list) - _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) + test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) + _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) def __post_init__(self): self.kernel_name = self.kernel_name or self.kernel.__name__ self.reference_inputs_fn = self.reference_inputs_fn or self.sample_inputs_fn - skips_map = defaultdict(list) - for skip in self.skips: - skips_map[skip.test_name].append(skip) - self._skips_map = dict(skips_map) + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) - def maybe_skip(self, *, test_name, args_kwargs, device): - skips = self._skips_map.get(test_name) - if not skips: - return - - for skip in skips: - if skip.condition(args_kwargs, device): - pytest.skip(skip.reason) + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -87,16 +88,27 @@ def pil_reference_wrapper(pil_kernel): return wrapper -def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): - return Skip( - "test_scripted_vs_eager", - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs[name], (int, float)), - reason=reason, +def mark_framework_limitation(test_id, reason): + # The purpose of this function is to have a single entry point for skip marks that are only there, because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we are wasting CI resources for no reason for most of the time. 
+ return TestMark(test_id, pytest.mark.skip(reason=reason)) + + +def xfail_python_scalar_arg_jit(name, *, reason=None): + reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" + return TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), ) -def skip_integer_size_jit(name="size"): - return skip_python_scalar_arg_jit(name, reason="Integer size is not supported when scripting.") +def xfail_integer_size_jit(name="size"): + return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") KERNEL_INFOS = [] @@ -151,8 +163,7 @@ KERNEL_INFOS.extend( def _get_resize_sizes(image_size): height, width = image_size length = max(image_size) - # FIXME: enable me when the kernels are fixed - # yield length + yield length yield [length] yield (length,) new_height = int(height * 0.75) @@ -236,15 +247,15 @@ KERNEL_INFOS.extend( reference_fn=reference_resize_image_tensor, reference_inputs_fn=reference_inputs_resize_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), KernelInfo( F.resize_bounding_box, sample_inputs_fn=sample_inputs_resize_bounding_box, - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), KernelInfo( @@ -253,8 +264,8 @@ KERNEL_INFOS.extend( reference_fn=reference_resize_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), ] @@ -436,16 +447,6 @@ def reference_inputs_resize_mask(): yield ArgsKwargs(mask_loader, **affine_kwargs) -# FIXME: @datumbox, remove this as soon as you have fixed the behavior in https://github.com/pytorch/vision/pull/6636 -def skip_scalar_shears(*test_names): - for test_name in test_names: - yield Skip( - test_name, - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs["shear"], (int, float)), - reason="The kernel is broken for a scalar `shear`", - ) - - KERNEL_INFOS.extend( [ KernelInfo( @@ -454,7 +455,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.affine_image_pil), reference_inputs_fn=reference_inputs_affine_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], + test_marks=[xfail_python_scalar_arg_jit("shear")], ), KernelInfo( F.affine_bounding_box, @@ -462,13 +463,8 @@ KERNEL_INFOS.extend( reference_fn=reference_affine_bounding_box, reference_inputs_fn=reference_inputs_affine_bounding_box, closeness_kwargs=dict(atol=1, rtol=0), - skips=[ - skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), - *skip_scalar_shears( - "test_batched_vs_single", - "test_no_inplace", - "test_dtype_and_device_consistency", - ), + test_marks=[ + xfail_python_scalar_arg_jit("shear"), ], ), KernelInfo( @@ -477,7 +473,7 @@ KERNEL_INFOS.extend( reference_fn=reference_affine_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], + test_marks=[xfail_python_scalar_arg_jit("shear")], ), ] ) @@ -1093,15 +1089,15 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.center_crop_image_pil), 
reference_inputs_fn=reference_inputs_center_crop_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), KernelInfo( F.center_crop_bounding_box, sample_inputs_fn=sample_inputs_center_crop_bounding_box, - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), KernelInfo( @@ -1110,8 +1106,8 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.center_crop_image_pil), reference_inputs_fn=reference_inputs_center_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), ] @@ -1138,9 +1134,9 @@ KERNEL_INFOS.append( F.gaussian_blur_image_tensor, sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_python_scalar_arg_jit("kernel_size"), - skip_python_scalar_arg_jit("sigma"), + test_marks=[ + xfail_python_scalar_arg_jit("kernel_size"), + xfail_python_scalar_arg_jit("sigma"), ], ) ) @@ -1551,9 +1547,9 @@ KERNEL_INFOS.extend( sample_inputs_fn=sample_inputs_five_crop_image_tensor, reference_fn=pil_reference_wrapper(F.five_crop_image_pil), reference_inputs_fn=reference_inputs_five_crop_image_tensor, - skips=[ - skip_integer_size_jit(), - Skip("test_batched_vs_single", reason="Custom batching needed for five_crop_image_tensor."), + test_marks=[ + xfail_integer_size_jit(), + mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), @@ -1562,9 +1558,9 @@ KERNEL_INFOS.extend( sample_inputs_fn=sample_inputs_ten_crop_image_tensor, reference_fn=pil_reference_wrapper(F.ten_crop_image_pil), reference_inputs_fn=reference_inputs_ten_crop_image_tensor, - skips=[ - skip_integer_size_jit(), - Skip("test_batched_vs_single", reason="Custom batching needed for ten_crop_image_tensor."), + test_marks=[ + xfail_integer_size_jit(), + mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 143a5cd22..a6523045c 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1,3 +1,4 @@ +import functools import math import os @@ -26,33 +27,60 @@ def script(fn): raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error -@pytest.fixture(autouse=True) -def maybe_skip(request): - # In case the test uses no parametrization or fixtures, the `callspec` attribute does not exist - try: - callspec = request.node.callspec - except AttributeError: - return +def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, name_fn=lambda info: str(info)): + if condition is None: - try: - info = callspec.params["info"] - args_kwargs = callspec.params["args_kwargs"] - except KeyError: - return + def condition(info): + return True - info.maybe_skip( - test_name=request.node.originalname, args_kwargs=args_kwargs, device=callspec.params.get("device", "cpu") - ) + def decorator(test_fn): + parts = test_fn.__qualname__.split(".") + if len(parts) == 1: + test_class_name = None + test_function_name = parts[0] + elif len(parts) == 2: + test_class_name, test_function_name = parts + else: 
+ raise pytest.UsageError("Unable to parse the test class and test name from test function") + test_id = (test_class_name, test_function_name) + + argnames = ("info", "args_kwargs") + argvalues = [] + for info in infos: + if not condition(info): + continue + + args_kwargs = list(args_kwargs_fn(info)) + name = name_fn(info) + idx_field_len = len(str(len(args_kwargs))) + + for idx, args_kwargs_ in enumerate(args_kwargs): + argvalues.append( + pytest.param( + info, + args_kwargs_, + marks=info.get_marks(test_id, args_kwargs_), + id=f"{name}-{idx:0{idx_field_len}}", + ) + ) + + return pytest.mark.parametrize(argnames, argvalues)(test_fn) + + return decorator class TestKernels: - sample_inputs = pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.kernel_name}-{idx}") - for info in KERNEL_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs_fn()) - ], + make_kernel_args_kwargs_parametrization = functools.partial( + make_args_kwargs_parametrization, name_fn=lambda info: info.kernel_name + ) + sample_inputs = kernel_sample_inputs = make_kernel_args_kwargs_parametrization( + KERNEL_INFOS, + args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), + ) + reference_inputs = make_kernel_args_kwargs_parametrization( + KERNEL_INFOS, + args_kwargs_fn=lambda info: info.reference_inputs_fn(), + condition=lambda info: info.reference_fn is not None, ) @sample_inputs @@ -156,15 +184,7 @@ class TestKernels: assert output.dtype == input.dtype assert output.device == input.device - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.kernel_name}-{idx}") - for info in KERNEL_INFOS - for idx, args_kwargs in enumerate(info.reference_inputs_fn()) - if info.reference_fn is not None - ], - ) + @reference_inputs def test_against_reference(self, info, args_kwargs): args, kwargs = args_kwargs.load("cpu") @@ -187,15 +207,16 @@ def spy_on(mocker): class TestDispatchers: - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels - ], + make_dispatcher_args_kwargs_parametrization = functools.partial( + make_args_kwargs_parametrization, name_fn=lambda info: info.dispatcher.__name__ ) + image_sample_inputs = kernel_sample_inputs = make_dispatcher_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(features.Image), + condition=lambda info: features.Image in info.kernels, + ) + + @image_sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_scripted_smoke(self, info, args_kwargs, device): dispatcher = script(info.dispatcher) @@ -223,15 +244,7 @@ class TestDispatchers: def test_scriptable(self, dispatcher): script(dispatcher) - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels - ], - ) + @image_sample_inputs def test_dispatch_simple_tensor(self, info, args_kwargs, spy_on): (image_feature, *other_args), kwargs = args_kwargs.load() image_simple_tensor = torch.Tensor(image_feature) @@ -243,14 +256,10 @@ class TestDispatchers: spy.assert_called_once() - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, 
args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels and info.pil_kernel_info is not None - ], + @make_dispatcher_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(features.Image), + condition=lambda info: info.pil_kernel_info is not None, ) def test_dispatch_pil(self, info, args_kwargs, spy_on): (image_feature, *other_args), kwargs = args_kwargs.load() @@ -267,13 +276,9 @@ class TestDispatchers: spy.assert_called_once() - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs()) - ], + @make_dispatcher_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(), ) def test_dispatch_feature(self, info, args_kwargs, spy_on): (feature, *other_args), kwargs = args_kwargs.load() -- GitLab From 96d1fecf282fa23883fe1953f44edd20c8a8658a Mon Sep 17 00:00:00 2001 From: Aditya Gandhamal <61016383+adityagandhamal@users.noreply.github.com> Date: Thu, 6 Oct 2022 00:42:37 +0530 Subject: [PATCH 013/624] Handle invalid reduction values (#6675) * Add ValueError * Add tests for ValueError * Add tests for ValueError * Add ValueError * Change to if/else * Ammend iou_fn tests * Move code excerpt * Format tests Co-authored-by: Philip Meier Co-authored-by: Vasilis Vryniotis --- test/test_ops.py | 22 ++++++++++++++++++++++ torchvision/ops/ciou_loss.py | 11 +++++++++-- torchvision/ops/diou_loss.py | 9 ++++++++- torchvision/ops/focal_loss.py | 11 +++++++++-- torchvision/ops/giou_loss.py | 10 ++++++++-- 5 files changed, 56 insertions(+), 7 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index b34fbe7f2..d76e57fae 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1394,6 +1394,11 @@ class TestGeneralizedBoxIouLoss: assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, device=device, reduction="sum") assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, device=device, reduction="mean") + # Test reduction value + # reduction value other than ["none", "mean", "sum"] should raise a ValueError + with pytest.raises(ValueError, match="Invalid"): + ops.generalized_box_iou_loss(box1s, box2s, reduction="xyz") + @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): @@ -1413,6 +1418,9 @@ class TestCompleteBoxIouLoss: assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") + with pytest.raises(ValueError, match="Invalid"): + ops.complete_box_iou_loss(box1s, box2s, reduction="xyz") + @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): @@ -1432,6 +1440,9 @@ class TestDistanceBoxIouLoss: assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") + with pytest.raises(ValueError, match="Invalid"): + ops.distance_box_iou_loss(box1s, box2s, reduction="xyz") + @pytest.mark.parametrize("device", cpu_and_gpu()) 
@pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_distance_iou_inputs(self, dtype, device): @@ -1554,6 +1565,17 @@ class TestFocalLoss: tol = 1e-3 if dtype is torch.half else 1e-5 torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol) + # Raise ValueError for anonymous reduction mode + @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) + def test_reduction_mode(self, device, dtype, reduction="xyz"): + if device == "cpu" and dtype is torch.half: + pytest.skip("Currently torch.half is not fully supported on cpu") + torch.random.manual_seed(0) + inputs, targets = self._generate_diverse_input_target_pair(device=device, dtype=dtype) + with pytest.raises(ValueError, match="Invalid"): + ops.sigmoid_focal_loss(inputs, targets, 0.25, 2, reduction) + class TestMasksToBoxes: def test_masks_box(self): diff --git a/torchvision/ops/ciou_loss.py b/torchvision/ops/ciou_loss.py index a9f20a5f4..75a1c4cb1 100644 --- a/torchvision/ops/ciou_loss.py +++ b/torchvision/ops/ciou_loss.py @@ -63,9 +63,16 @@ def complete_box_iou_loss( alpha = v / (1 - iou + v + eps) loss = diou_loss + alpha * v - if reduction == "mean": + + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() - + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss diff --git a/torchvision/ops/diou_loss.py b/torchvision/ops/diou_loss.py index 2187aea4c..caf62bd2c 100644 --- a/torchvision/ops/diou_loss.py +++ b/torchvision/ops/diou_loss.py @@ -50,10 +50,17 @@ def distance_box_iou_loss( loss, _ = _diou_iou_loss(boxes1, boxes2, eps) - if reduction == "mean": + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss diff --git a/torchvision/ops/focal_loss.py b/torchvision/ops/focal_loss.py index c8cc9a8ac..08c282555 100644 --- a/torchvision/ops/focal_loss.py +++ b/torchvision/ops/focal_loss.py @@ -32,6 +32,7 @@ def sigmoid_focal_loss( Loss tensor with the reduction option applied. 
""" # Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(sigmoid_focal_loss) p = torch.sigmoid(inputs) @@ -43,9 +44,15 @@ def sigmoid_focal_loss( alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss - if reduction == "mean": + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() elif reduction == "sum": loss = loss.sum() - + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss diff --git a/torchvision/ops/giou_loss.py b/torchvision/ops/giou_loss.py index 0c555ec4f..03ef8e622 100644 --- a/torchvision/ops/giou_loss.py +++ b/torchvision/ops/giou_loss.py @@ -62,9 +62,15 @@ def generalized_box_iou_loss( loss = 1 - miouk - if reduction == "mean": + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() - + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss -- GitLab From d020820edcc7c417fe9ca581da23b298ea6dfb46 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 6 Oct 2022 09:15:47 +0200 Subject: [PATCH 014/624] make pytest summary more concise (#6708) * make pytest summary more concise * fix comment --- pytest.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index 1dde465d3..a2f59ecec 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = - # show summary of all tests that did not pass - -ra + # show tests that (f)ailed, (E)rror, or (X)passed in the summary + -rfEX # Make tracebacks shorter --tb=native # enable all warnings -- GitLab From e3941afca3f380914397cb0e5665e5d616d440ae Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 6 Oct 2022 12:40:15 +0100 Subject: [PATCH 015/624] Remove unnecessary `ignore` directives to fix mypy (#6713) --- torchvision/prototype/features/_feature.py | 8 ++++---- torchvision/prototype/features/_image.py | 2 +- torchvision/prototype/transforms/functional/_misc.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index 9c0cece15..2da10be90 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -32,10 +32,10 @@ class _Feature(torch.Tensor): return ( torch.as_tensor( # type: ignore[return-value] data, - dtype=dtype, # type: ignore[arg-type] - device=device, # type: ignore[arg-type] + dtype=dtype, + device=device, ) - .as_subclass(cls) # type: ignore[arg-type] + .as_subclass(cls) .requires_grad_(requires_grad) ) @@ -115,7 +115,7 @@ class _Feature(torch.Tensor): # Inplace `func`'s, canonically identified with a trailing underscore in their name like `.add_(...)`, # will retain the input type. Thus, we need to unwrap here. 
if isinstance(output, cls): - return output.as_subclass(torch.Tensor) # type: ignore[arg-type] + return output.as_subclass(torch.Tensor) return output diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 21126c7f2..c953ae78c 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -71,7 +71,7 @@ class Image(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> Image: - data = torch.as_tensor(data, dtype=dtype, device=device) # type: ignore[arg-type] + data = torch.as_tensor(data, dtype=dtype, device=device) if data.ndim < 2: raise ValueError elif data.ndim == 2: diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 03ddf05ac..6f35781d4 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -16,7 +16,7 @@ def normalize( correct_type = isinstance(inpt, torch.Tensor) else: correct_type = features.is_simple_tensor(inpt) or isinstance(inpt, features.Image) - inpt = inpt.as_subclass(torch.Tensor) # type: ignore[arg-type] + inpt = inpt.as_subclass(torch.Tensor) if not correct_type: raise TypeError(f"img should be Tensor Image. Got {type(inpt)}") -- GitLab From 026991b152ffc3cbad8f49fe3f448ee66fe58803 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 6 Oct 2022 15:19:44 +0200 Subject: [PATCH 016/624] Reduce sample inputs for prototype transform kernels (#6714) * pad_image_tensor * pad_mask and pad_bounding_box * resize_{image_tensor, mask, bounding_box} * center_crop_{image_tensor, mask} * {five, ten}_crop_image_tensor * crop_{image_tensor, mask} * convert_color_space_image_tensor * affine_{image_tensor, mask, bounding_box} * rotate_{image_tensor, mask} * gaussian_blur_image_tensor * cleanup --- test/prototype_common_utils.py | 6 +- test/prototype_transforms_dispatcher_infos.py | 60 ++- test/prototype_transforms_kernel_infos.py | 389 ++++++++++++------ torchvision/transforms/functional_tensor.py | 7 +- 4 files changed, 309 insertions(+), 153 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index e9192f44f..333e11fb2 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -28,6 +28,7 @@ __all__ = [ "assert_close", "assert_equal", "ArgsKwargs", + "VALID_EXTRA_DIMS", "make_image_loaders", "make_image", "make_images", @@ -201,7 +202,10 @@ def _parse_image_size(size, *, name="size"): ) -DEFAULT_EXTRA_DIMS = ((), (0,), (4,), (2, 3), (5, 0), (0, 5)) +VALID_EXTRA_DIMS = ((), (4,), (2, 3)) +DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) + +DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) def from_loader(loader_fn): diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index 11a4c35ae..9678249aa 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -63,17 +63,40 @@ class DispatcherInfo: yield args_kwargs -def xfail_python_scalar_arg_jit(name, *, reason=None): +def xfail_jit_python_scalar_arg(name, *, reason=None): reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" return TestMark( ("TestDispatchers", "test_scripted_smoke"), pytest.mark.xfail(reason=reason), - condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), + condition=lambda args_kwargs: 
isinstance(args_kwargs.kwargs.get(name), (int, float)), ) -def xfail_integer_size_jit(name="size"): - return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") +def xfail_jit_integer_size(name="size"): + return xfail_jit_python_scalar_arg(name, reason=f"Integer `{name}` is not supported when scripting.") + + +def xfail_jit_tuple_instead_of_list(name, *, reason=None): + reason = reason or f"Passing a tuple instead of a list for `{name}` is not supported when scripting" + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), tuple), + ) + + +def is_list_of_ints(args_kwargs): + fill = args_kwargs.kwargs.get("fill") + return isinstance(fill, list) and any(isinstance(scalar_fill, int) for scalar_fill in fill) + + +def xfail_jit_list_of_ints(name, *, reason=None): + reason = reason or f"Passing a list of integers for `{name}` is not supported when scripting" + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=is_list_of_ints, + ) skip_dispatch_feature = TestMark( @@ -123,7 +146,7 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.resize_image_pil), test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), DispatcherInfo( @@ -136,7 +159,10 @@ DISPATCHER_INFOS = [ pil_kernel_info=PILKernelInfo(F.affine_image_pil), test_marks=[ xfail_dispatch_pil_if_fill_sequence_needs_broadcast, - xfail_python_scalar_arg_jit("shear"), + xfail_jit_python_scalar_arg("shear"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), ], ), DispatcherInfo( @@ -156,6 +182,11 @@ DISPATCHER_INFOS = [ features.Mask: F.rotate_mask, }, pil_kernel_info=PILKernelInfo(F.rotate_image_pil), + test_marks=[ + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), DispatcherInfo( F.crop, @@ -194,7 +225,12 @@ DISPATCHER_INFOS = [ ), condition=lambda args_kwargs: fill_sequence_needs_broadcast(args_kwargs) and args_kwargs.kwargs.get("padding_mode", "constant") == "constant", - ) + ), + xfail_jit_python_scalar_arg("padding"), + xfail_jit_tuple_instead_of_list("padding"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), ], ), DispatcherInfo( @@ -227,7 +263,7 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), test_marks=[ - xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), DispatcherInfo( @@ -237,8 +273,8 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.gaussian_blur_image_pil), test_marks=[ - xfail_python_scalar_arg_jit("kernel_size"), - xfail_python_scalar_arg_jit("sigma"), + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), ], ), DispatcherInfo( @@ -335,7 +371,7 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.five_crop_image_pil), test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), skip_dispatch_feature, ], ), @@ -345,7 +381,7 @@ DISPATCHER_INFOS = [ features.Image: F.ten_crop_image_tensor, }, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), skip_dispatch_feature, ], 
pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 2e02989b4..c0e7bf5bf 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -12,8 +12,16 @@ import torchvision.ops import torchvision.prototype.transforms.functional as F from _pytest.mark.structures import MarkDecorator +from common_utils import cycle_over from datasets_utils import combinations_grid -from prototype_common_utils import ArgsKwargs, make_bounding_box_loaders, make_image_loaders, make_mask_loaders +from prototype_common_utils import ( + ArgsKwargs, + make_bounding_box_loaders, + make_image_loader, + make_image_loaders, + make_mask_loaders, + VALID_EXTRA_DIMS, +) from torchvision.prototype import features from torchvision.transforms.functional_tensor import _max_value as get_max_value @@ -98,17 +106,40 @@ def mark_framework_limitation(test_id, reason): return TestMark(test_id, pytest.mark.skip(reason=reason)) -def xfail_python_scalar_arg_jit(name, *, reason=None): +def xfail_jit_python_scalar_arg(name, *, reason=None): reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" return TestMark( ("TestKernels", "test_scripted_vs_eager"), pytest.mark.xfail(reason=reason), - condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), + ) + + +def xfail_jit_integer_size(name="size"): + return xfail_jit_python_scalar_arg(name, reason=f"Integer `{name}` is not supported when scripting.") + + +def xfail_jit_tuple_instead_of_list(name, *, reason=None): + reason = reason or f"Passing a tuple instead of a list for `{name}` is not supported when scripting" + return TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), tuple), ) -def xfail_integer_size_jit(name="size"): - return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") +def is_list_of_ints(args_kwargs): + fill = args_kwargs.kwargs.get("fill") + return isinstance(fill, list) and any(isinstance(scalar_fill, int) for scalar_fill in fill) + + +def xfail_jit_list_of_ints(name, *, reason=None): + reason = reason or f"Passing a list of integers for `{name}` is not supported when scripting" + return TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.xfail(reason=reason), + condition=is_list_of_ints, + ) KERNEL_INFOS = [] @@ -173,15 +204,33 @@ def _get_resize_sizes(image_size): def sample_inputs_resize_image_tensor(): + for image_loader in make_image_loaders( + sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ): + for size in _get_resize_sizes(image_loader.image_size): + yield ArgsKwargs(image_loader, size=size) + for image_loader, interpolation in itertools.product( - make_image_loaders(dtypes=[torch.float32]), + make_image_loaders(sizes=["random"], color_spaces=[features.ColorSpace.RGB]), [ F.InterpolationMode.NEAREST, + F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC, ], ): - for size in _get_resize_sizes(image_loader.image_size): - yield ArgsKwargs(image_loader, size=size, interpolation=interpolation) + yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) + + # We have a speed hack in place for nearest interpolation and single 
channel images (grayscale) + for image_loader in make_image_loaders( + sizes=["random"], + color_spaces=[features.ColorSpace.GRAY], + extra_dims=VALID_EXTRA_DIMS, + ): + yield ArgsKwargs( + image_loader, size=[min(image_loader.image_size) + 1], interpolation=F.InterpolationMode.NEAREST + ) + + yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) @pil_reference_wrapper @@ -217,15 +266,14 @@ def reference_inputs_resize_image_tensor(): def sample_inputs_resize_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(formats=[features.BoundingBoxFormat.XYXY]): + for bounding_box_loader in make_bounding_box_loaders(): for size in _get_resize_sizes(bounding_box_loader.image_size): yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) def sample_inputs_resize_mask(): - for mask_loader in make_mask_loaders(dtypes=[torch.uint8]): - for size in _get_resize_sizes(mask_loader.shape[-2:]): - yield ArgsKwargs(mask_loader, size=size) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, size=[min(mask_loader.shape[-2:]) + 1]) @pil_reference_wrapper @@ -248,14 +296,14 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_resize_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), KernelInfo( F.resize_bounding_box, sample_inputs_fn=sample_inputs_resize_bounding_box, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), KernelInfo( @@ -265,7 +313,7 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), ] @@ -290,28 +338,51 @@ def _diversify_affine_kwargs_types(affine_kwargs): yield dict(affine_kwargs, shear=diverse_shear) +def _full_affine_params(**partial_params): + partial_params.setdefault("angle", 0.0) + partial_params.setdefault("translate", [0.0, 0.0]) + partial_params.setdefault("scale", 1.0) + partial_params.setdefault("shear", [0.0, 0.0]) + partial_params.setdefault("center", None) + return partial_params + + +_DIVERSE_AFFINE_PARAMS = [ + _full_affine_params(**{name: arg}) + for name, args in [ + ("angle", [1.0, 2]), + ("translate", [[1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]), + ("scale", [0.5]), + ("shear", [1.0, 2, [1.0], [2], (1.0,), (2,), [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]), + ("center", [None, [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]), + ] + for arg in args +] + + def sample_inputs_affine_image_tensor(): - for image_loader, interpolation_mode, center in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), + make_affine_image_loaders = functools.partial( + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ) + + for image_loader, affine_params in itertools.product(make_affine_image_loaders(), _DIVERSE_AFFINE_PARAMS): + yield ArgsKwargs(image_loader, **affine_params) + + for image_loader in make_affine_image_loaders(): + fills = [None, 0.5] + if image_loader.num_channels > 1: + fills.extend(vector_fill * image_loader.num_channels for vector_fill in [(0.5,), (1,), [0.5], [1]]) + for fill in fills: + yield ArgsKwargs(image_loader, **_full_affine_params(), fill=fill) + + for image_loader, interpolation in itertools.product( + make_affine_image_loaders(), [ F.InterpolationMode.NEAREST, 
F.InterpolationMode.BILINEAR, ], - [None, (0, 0)], - ): - for fill in [None, 128.0, 128, [12.0], [0.5] * image_loader.num_channels]: - yield ArgsKwargs( - image_loader, - interpolation=interpolation_mode, - center=center, - fill=fill, - **_AFFINE_KWARGS[0], - ) - - for image_loader, affine_kwargs in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) ): - yield ArgsKwargs(image_loader, **affine_kwargs) + yield ArgsKwargs(image_loader, **_full_affine_params(), fill=0) def reference_inputs_affine_image_tensor(): @@ -324,22 +395,14 @@ def reference_inputs_affine_image_tensor(): def sample_inputs_affine_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - **_AFFINE_KWARGS[0], - ) - - for bounding_box_loader, affine_kwargs in itertools.product( - make_bounding_box_loaders(), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) + for bounding_box_loader, affine_params in itertools.product( + make_bounding_box_loaders(formats=[features.BoundingBoxFormat.XYXY]), _DIVERSE_AFFINE_PARAMS ): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size, - **affine_kwargs, + **affine_params, ) @@ -423,16 +486,8 @@ def reference_inputs_affine_bounding_box(): def sample_inputs_affine_image_mask(): - for mask_loader, center in itertools.product( - make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]), - [None, (0, 0)], - ): - yield ArgsKwargs(mask_loader, center=center, **_AFFINE_KWARGS[0]) - - for mask_loader, affine_kwargs in itertools.product( - make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) - ): - yield ArgsKwargs(mask_loader, **affine_kwargs) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, **_full_affine_params()) @pil_reference_wrapper @@ -455,7 +510,12 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.affine_image_pil), reference_inputs_fn=reference_inputs_affine_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=[xfail_python_scalar_arg_jit("shear")], + test_marks=[ + xfail_jit_python_scalar_arg("shear"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), KernelInfo( F.affine_bounding_box, @@ -464,7 +524,7 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_affine_bounding_box, closeness_kwargs=dict(atol=1, rtol=0), test_marks=[ - xfail_python_scalar_arg_jit("shear"), + xfail_jit_python_scalar_arg("shear"), ], ), KernelInfo( @@ -473,7 +533,9 @@ KERNEL_INFOS.extend( reference_fn=reference_affine_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=[xfail_python_scalar_arg_jit("shear")], + test_marks=[ + xfail_jit_python_scalar_arg("shear"), + ], ), ] ) @@ -514,15 +576,21 @@ KERNEL_INFOS.append( def sample_inputs_convert_color_space_image_tensor(): - color_spaces = set(features.ColorSpace) - {features.ColorSpace.OTHER} - for image_loader in make_image_loaders(sizes=["random"], color_spaces=color_spaces, constant_alpha=True): - old_color_space = image_loader.color_space - for params in combinations_grid(new_color_space=color_spaces - 
{old_color_space}, copy=(True, False)): - yield ArgsKwargs(image_loader, old_color_space=old_color_space, **params) + color_spaces = list(set(features.ColorSpace) - {features.ColorSpace.OTHER}) + + for old_color_space, new_color_space in cycle_over(color_spaces): + for image_loader in make_image_loaders(sizes=["random"], color_spaces=[old_color_space], constant_alpha=True): + yield ArgsKwargs(image_loader, old_color_space=old_color_space, new_color_space=new_color_space) + + for color_space in color_spaces: + for image_loader in make_image_loaders( + sizes=["random"], color_spaces=[color_space], dtypes=[torch.float32], constant_alpha=True + ): + yield ArgsKwargs(image_loader, old_color_space=color_space, new_color_space=color_space, copy=False) @pil_reference_wrapper -def reference_convert_color_space_image_tensor(image_pil, old_color_space, new_color_space, copy): +def reference_convert_color_space_image_tensor(image_pil, old_color_space, new_color_space, copy=True): color_space_pil = features.ColorSpace.from_pil_mode(image_pil.mode) if color_space_pil != old_color_space: raise pytest.UsageError( @@ -600,25 +668,30 @@ _ROTATE_ANGLES = [-87, 15, 90] def sample_inputs_rotate_image_tensor(): - for image_loader, params in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), - combinations_grid( - interpolation=[F.InterpolationMode.NEAREST, F.InterpolationMode.BILINEAR], - expand=[True, False], - center=[None, (0, 0)], - ), + make_rotate_image_loaders = functools.partial( + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ) + + for image_loader in make_rotate_image_loaders(): + yield ArgsKwargs(image_loader, angle=15.0, expand=True) + + for image_loader, center in itertools.product( + make_rotate_image_loaders(), [None, [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)] ): - if params["center"] is not None and params["expand"]: - # Otherwise this will emit a warning and ignore center anyway - continue + yield ArgsKwargs(image_loader, angle=15.0, center=center) - for fill in [None, 0.5, [0.5] * image_loader.num_channels]: - yield ArgsKwargs( - image_loader, - angle=_ROTATE_ANGLES[0], - fill=fill, - **params, - ) + for image_loader in make_rotate_image_loaders(): + fills = [None, 0.5] + if image_loader.num_channels > 1: + fills.extend(vector_fill * image_loader.num_channels for vector_fill in [(0.5,), (1,), [0.5], [1]]) + for fill in fills: + yield ArgsKwargs(image_loader, angle=15.0, fill=fill) + + for image_loader, interpolation in itertools.product( + make_rotate_image_loaders(), + [F.InterpolationMode.NEAREST, F.InterpolationMode.BILINEAR], + ): + yield ArgsKwargs(image_loader, angle=15.0, fill=0) def reference_inputs_rotate_image_tensor(): @@ -637,22 +710,8 @@ def sample_inputs_rotate_bounding_box(): def sample_inputs_rotate_mask(): - for image_loader, params in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.uint8]), - combinations_grid( - expand=[True, False], - center=[None, (0, 0)], - ), - ): - if params["center"] is not None and params["expand"]: - # Otherwise this will emit a warning and ignore center anyway - continue - - yield ArgsKwargs( - image_loader, - angle=_ROTATE_ANGLES[0], - **params, - ) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, angle=15.0) @pil_reference_wrapper @@ -673,6 +732,11 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.rotate_image_pil), 
reference_inputs_fn=reference_inputs_rotate_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=[ + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), KernelInfo( F.rotate_bounding_box, @@ -692,7 +756,16 @@ _CROP_PARAMS = combinations_grid(top=[-8, 0, 9], left=[-8, 0, 9], height=[12, 20 def sample_inputs_crop_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): + for image_loader, params in itertools.product( + make_image_loaders(sizes=[(16, 17)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]), + [ + dict(top=4, left=3, height=7, width=8), + dict(top=-1, left=3, height=7, width=8), + dict(top=4, left=-1, height=7, width=8), + dict(top=4, left=3, height=17, width=8), + dict(top=4, left=3, height=7, width=18), + ], + ): yield ArgsKwargs(image_loader, **params) @@ -709,8 +782,8 @@ def sample_inputs_crop_bounding_box(): def sample_inputs_crop_mask(): - for mask_loader, params in itertools.product(make_mask_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): - yield ArgsKwargs(mask_loader, **params) + for mask_loader in make_mask_loaders(sizes=[(16, 17)], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, top=4, left=3, height=7, width=8) def reference_inputs_crop_mask(): @@ -829,12 +902,34 @@ _PAD_PARAMS = combinations_grid( def sample_inputs_pad_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(sizes=["random"]), _PAD_PARAMS): - fills = [None, 128.0, 128, [12.0]] - if params["padding_mode"] == "constant": - fills.append([12.0 + c for c in range(image_loader.num_channels)]) + make_pad_image_loaders = functools.partial( + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ) + + for image_loader, padding in itertools.product( + make_pad_image_loaders(), + [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]], + ): + yield ArgsKwargs(image_loader, padding=padding) + + for image_loader in make_pad_image_loaders(): + fills = [None, 0.5] + if image_loader.num_channels > 1: + fills.extend(vector_fill * image_loader.num_channels for vector_fill in [(0.5,), (1,), [0.5], [1]]) for fill in fills: - yield ArgsKwargs(image_loader, fill=fill, **params) + yield ArgsKwargs(image_loader, padding=[1], fill=fill) + + for image_loader, padding_mode in itertools.product( + # We branch for non-constant padding and integer inputs + make_pad_image_loaders(dtypes=[torch.uint8]), + ["constant", "symmetric", "edge", "reflect"], + ): + yield ArgsKwargs(image_loader, padding=[1], padding_mode=padding_mode) + + # `torch.nn.functional.pad` does not support symmetric padding, and thus we have a custom implementation. Besides + # negative padding, this is already handled by the inputs above. 
+ for image_loader in make_pad_image_loaders(): + yield ArgsKwargs(image_loader, padding=[-1], padding_mode="symmetric") def reference_inputs_pad_image_tensor(): @@ -848,18 +943,21 @@ def reference_inputs_pad_image_tensor(): def sample_inputs_pad_bounding_box(): - for bounding_box_loader, params in itertools.product(make_bounding_box_loaders(), _PAD_PARAMS): - if params["padding_mode"] != "constant": - continue - + for bounding_box_loader, padding in itertools.product( + make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size, **params + bounding_box_loader, + format=bounding_box_loader.format, + image_size=bounding_box_loader.image_size, + padding=padding, + padding_mode="constant", ) def sample_inputs_pad_mask(): - for image_loader, fill, params in itertools.product(make_mask_loaders(sizes=["random"]), [None, 127], _PAD_PARAMS): - yield ArgsKwargs(image_loader, fill=fill, **params) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, padding=[1]) def reference_inputs_pad_mask(): @@ -875,10 +973,21 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.pad_image_pil), reference_inputs_fn=reference_inputs_pad_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + xfail_jit_tuple_instead_of_list("padding"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), KernelInfo( F.pad_bounding_box, sample_inputs_fn=sample_inputs_pad_bounding_box, + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + xfail_jit_tuple_instead_of_list("padding"), + ], ), KernelInfo( F.pad_mask, @@ -1045,7 +1154,13 @@ _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] def sample_inputs_center_crop_image_tensor(): for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES), _CENTER_CROP_OUTPUT_SIZES + make_image_loaders(sizes=[(16, 17)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]), + [ + # valid `output_size` types for which cropping is applied to both dimensions + *[5, (4,), (2, 3), [6], [3, 2]], + # `output_size`'s for which at least one dimension needs to be padded + *[[4, 18], [17, 5], [17, 18]], + ], ): yield ArgsKwargs(image_loader, output_size=output_size) @@ -1068,10 +1183,9 @@ def sample_inputs_center_crop_bounding_box(): def sample_inputs_center_crop_mask(): - for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES), _CENTER_CROP_OUTPUT_SIZES - ): - yield ArgsKwargs(mask_loader, output_size=output_size) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + height, width = mask_loader.shape[-2:] + yield ArgsKwargs(mask_loader, output_size=(height // 2, width // 2)) def reference_inputs_center_crop_mask(): @@ -1090,14 +1204,14 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_center_crop_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), KernelInfo( F.center_crop_bounding_box, sample_inputs_fn=sample_inputs_center_crop_bounding_box, test_marks=[ - 
xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), KernelInfo( @@ -1107,7 +1221,7 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_center_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), ] @@ -1115,18 +1229,21 @@ KERNEL_INFOS.extend( def sample_inputs_gaussian_blur_image_tensor(): - for image_loader, params in itertools.product( - make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ), - combinations_grid( - kernel_size=[(3, 3), [3, 3], 5], - sigma=[None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)], - ), + make_gaussian_blur_image_loaders = functools.partial( + make_image_loaders, + sizes=["random"], + color_spaces=[features.ColorSpace.RGB], + # FIXME: kernel should support arbitrary batch sizes + extra_dims=[(), (4,)], + ) + + for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): + yield ArgsKwargs(image_loader, kernel_size=kernel_size) + + for image_loader, sigma in itertools.product( + make_gaussian_blur_image_loaders(), [None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)] ): - yield ArgsKwargs(image_loader, **params) + yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma) KERNEL_INFOS.append( @@ -1135,8 +1252,8 @@ KERNEL_INFOS.append( sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_python_scalar_arg_jit("kernel_size"), - xfail_python_scalar_arg_jit("sigma"), + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), ], ) ) @@ -1518,7 +1635,9 @@ def _get_five_ten_crop_image_size(size): def sample_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ): yield ArgsKwargs(image_loader, size=size) @@ -1530,7 +1649,9 @@ def reference_inputs_five_crop_image_tensor(): def sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) @@ -1548,7 +1669,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.five_crop_image_pil), reference_inputs_fn=reference_inputs_five_crop_image_tensor, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, @@ -1559,7 +1680,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.ten_crop_image_pil), reference_inputs_fn=reference_inputs_ten_crop_image_tensor, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index 
2be2964b9..20b76fbf0 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -755,12 +755,7 @@ def gaussian_blur(img: Tensor, kernel_size: List[int], sigma: List[float]) -> Te kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype, device=img.device) kernel = kernel.expand(img.shape[-3], 1, kernel.shape[0], kernel.shape[1]) - img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in( - img, - [ - kernel.dtype, - ], - ) + img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [kernel.dtype]) # padding = (left, right, top, bottom) padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2] -- GitLab From 61034d534c1dff58a66bf7e2a9be8c173648a483 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 6 Oct 2022 15:19:22 +0100 Subject: [PATCH 017/624] Avoid recommuting the affine matrix in bbox rotate (#6712) * Avoid recommuting the affine matrix in bbox rotate * Fix linter * inverted=True for estimating image size * Update the image size estimation to match the one from the image kernel * Nits * Address comments. * Center=0,0 when expand=true --- .../transforms/functional/_geometry.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 6a035b257..7a291967b 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -279,9 +279,9 @@ def affine_image_tensor( center_f = [0.0, 0.0] if center is not None: # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. - center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])] + center_f = [(c - s * 0.5) for c, s in zip(center, [width, height])] - translate_f = [1.0 * t for t in translate] + translate_f = [float(t) for t in translate] matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear) output = _FT.affine(image, matrix, interpolation=interpolation.value, fill=fill) @@ -321,7 +321,7 @@ def _affine_bounding_box_xyxy( shear: List[float], center: Optional[List[float]] = None, expand: bool = False, -) -> torch.Tensor: +) -> Tuple[torch.Tensor, Tuple[int, int]]: angle, translate, shear, center = _affine_parse_args( angle, translate, scale, shear, InterpolationMode.NEAREST, center ) @@ -333,11 +333,16 @@ def _affine_bounding_box_xyxy( dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 device = bounding_box.device - affine_matrix = torch.tensor( - _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False), - dtype=dtype, - device=device, - ).view(2, 3) + affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False) + transposed_affine_matrix = ( + torch.tensor( + affine_vector, + dtype=dtype, + device=device, + ) + .view(2, 3) + .T + ) # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-left, bottom-right corners). 
# Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to @@ -345,7 +350,7 @@ def _affine_bounding_box_xyxy( points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using affine matrix - transformed_points = torch.matmul(points, affine_matrix.T) + transformed_points = torch.matmul(points, transposed_affine_matrix) # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] # and compute bounding box from 4 transformed points: transformed_points = transformed_points.view(-1, 4, 2) @@ -360,20 +365,24 @@ def _affine_bounding_box_xyxy( points = torch.tensor( [ [0.0, 0.0, 1.0], - [0.0, 1.0 * height, 1.0], - [1.0 * width, 1.0 * height, 1.0], - [1.0 * width, 0.0, 1.0], + [0.0, float(height), 1.0], + [float(width), float(height), 1.0], + [float(width), 0.0, 1.0], ], dtype=dtype, device=device, ) - new_points = torch.matmul(points, affine_matrix.T) + new_points = torch.matmul(points, transposed_affine_matrix) tr, _ = torch.min(new_points, dim=0, keepdim=True) # Translate bounding boxes out_bboxes[:, 0::2] = out_bboxes[:, 0::2] - tr[:, 0] out_bboxes[:, 1::2] = out_bboxes[:, 1::2] - tr[:, 1] + # Estimate meta-data for image with inverted=True and with center=[0,0] + affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) + new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height) + image_size = (new_height, new_width) - return out_bboxes.to(bounding_box.dtype) + return out_bboxes.to(bounding_box.dtype), image_size def affine_bounding_box( @@ -391,7 +400,7 @@ def affine_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) + out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) # out_bboxes should be of shape [N boxes, 4] @@ -502,7 +511,7 @@ def rotate_image_tensor( warnings.warn("The provided center argument has no effect on the result if expand is True") else: # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. - center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])] + center_f = [(c - s * 0.5) for c, s in zip(center, [width, height])] # due to current incoherence of rotation angle direction between affine and rotate implementations # we need to set -angle. 
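[Editor's aside, not part of the patch above] The corner-point transform that the reworked `_affine_bounding_box_xyxy` performs can be easier to follow outside the diff. Below is a minimal sketch of the same three numbered steps using toy values: a single XYXY box and a hand-written 2x3 affine matrix standing in for the one the real kernel builds with `_get_inverse_affine_matrix(..., inverted=False)`.

import torch

boxes = torch.tensor([[10.0, 20.0, 30.0, 60.0]])  # one box in XYXY format
affine_matrix = torch.tensor([[0.0, -1.0, 0.0], [1.0, 0.0, 0.0]])  # toy map: (x, y) -> (-y, x)

# 1) expand each box into its four corners as homogeneous (x, y, 1) points
points = boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = torch.cat([points, torch.ones(points.shape[0], 1)], dim=-1)

# 2) apply the affine map to all corners at once: (N * 4, 3) @ (3, 2) -> (N * 4, 2)
transformed = points @ affine_matrix.T

# 3) regroup per box and take min/max over the corners to get the enclosing XYXY box
transformed = transformed.reshape(-1, 4, 2)
out_boxes = torch.cat([transformed.min(dim=1).values, transformed.max(dim=1).values], dim=-1)
print(out_boxes)  # -> [[-60., 10., -20., 30.]]

With `expand=True` the kernel additionally applies the same matrix to the four image corners to derive the translation and the new canvas size, which is why the function now returns the image size alongside the boxes.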
@@ -558,7 +567,7 @@ def rotate_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes = _affine_bounding_box_xyxy( + out_bboxes, image_size = _affine_bounding_box_xyxy( bounding_box, image_size, angle=-angle, @@ -569,14 +578,6 @@ def rotate_bounding_box( expand=expand, ) - if expand: - # TODO: Move this computation inside of `_affine_bounding_box_xyxy` to avoid computing the rotation and points - # matrix twice - height, width = image_size - rotation_matrix = _get_inverse_affine_matrix([0.0, 0.0], angle, [0.0, 0.0], 1.0, [0.0, 0.0]) - new_width, new_height = _FT._compute_affine_output_size(rotation_matrix, width, height) - image_size = (new_height, new_width) - return ( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False -- GitLab From 7d2de404372b0a77c5dec825c62f739e75a351ee Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 6 Oct 2022 11:49:29 -0400 Subject: [PATCH 018/624] Fix windows python 3.8 required dlls not found (#6715) * Fix windows python 3.8 * Update torchvision/extension.py Co-authored-by: Vasilis Vryniotis * Update torchvision/extension.py Co-authored-by: Vasilis Vryniotis --- torchvision/extension.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torchvision/extension.py b/torchvision/extension.py index 3bad8351b..702e7e33b 100644 --- a/torchvision/extension.py +++ b/torchvision/extension.py @@ -16,6 +16,18 @@ def _has_ops(): try: + # On Windows Python-3.8.x has `os.add_dll_directory` call, + # which is called to configure dll search path. + # To find cuda related dlls we need to make sure the + # conda environment/bin path is configured Please take a look: + # https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python + if os.name == "nt" and sys.version_info >= (3, 8) and sys.version_info < (3, 9): + env_path = os.environ["PATH"] + path_arr = env_path.split(";") + for path in path_arr: + if os.path.exists(path): + os.add_dll_directory(path) # type: ignore[attr-defined] + lib_path = _get_extension_path("_C") torch.ops.load_library(lib_path) _HAS_OPS = True -- GitLab From 7eb5d7fcab73afec976907a855d9e63fa31f5579 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 6 Oct 2022 22:42:42 +0200 Subject: [PATCH 019/624] close streams in prototype datasets (#6647) * close streams in prototype datasets * refactor prototype SBD to avoid closing demux streams at construction time * mypy --- test/builtin_dataset_mocks.py | 28 ++++---- test/test_prototype_datasets_builtin.py | 70 +++++++++++++++---- .../prototype/datasets/_builtin/celeba.py | 10 +-- .../prototype/datasets/_builtin/cifar.py | 4 +- .../prototype/datasets/_builtin/clevr.py | 2 + .../prototype/datasets/_builtin/mnist.py | 2 + .../prototype/datasets/_builtin/pcam.py | 2 + .../prototype/datasets/_builtin/sbd.py | 68 ++++++++++-------- .../prototype/datasets/_builtin/voc.py | 4 +- .../prototype/datasets/utils/_internal.py | 8 +-- torchvision/prototype/features/_encoded.py | 4 +- 11 files changed, 135 insertions(+), 67 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 8c5484a28..001e7e831 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -661,15 +661,15 @@ class SBDMockData: _NUM_CATEGORIES = 20 @classmethod - def _make_split_files(cls, root_map): - ids_map = { - split: [f"2008_{idx:06d}" for idx in idcs] - for split, idcs in ( - ("train", [0, 1, 2]), - ("train_noval", [0, 2]), - ("val", 
[3]), - ) - } + def _make_split_files(cls, root_map, *, split): + splits_and_idcs = [ + ("train", [0, 1, 2]), + ("val", [3]), + ] + if split == "train_noval": + splits_and_idcs.append(("train_noval", [0, 2])) + + ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs} for split, ids in ids_map.items(): with open(root_map[split] / f"{split}.txt", "w") as fh: @@ -710,12 +710,14 @@ class SBDMockData: return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() @classmethod - def generate(cls, root): + def generate(cls, root, *, split): archive_folder = root / "benchmark_RELEASE" dataset_folder = archive_folder / "dataset" dataset_folder.mkdir(parents=True, exist_ok=True) - ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root})) + ids, num_samples_map = cls._make_split_files( + defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split + ) sizes = cls._make_anns_folder(dataset_folder, "cls", ids) create_image_folder( dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] @@ -723,12 +725,12 @@ class SBDMockData: make_tar(root, "benchmark.tgz", archive_folder, compression="gz") - return num_samples_map + return num_samples_map[split] @register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) def sbd(root, config): - return SBDMockData.generate(root)[config["split"]] + return SBDMockData.generate(root, split=config["split"]) @register_mock(configs=[dict()]) diff --git a/test/test_prototype_datasets_builtin.py b/test/test_prototype_datasets_builtin.py index 283a30a3d..7bea05fce 100644 --- a/test/test_prototype_datasets_builtin.py +++ b/test/test_prototype_datasets_builtin.py @@ -1,6 +1,7 @@ import functools import io import pickle +from collections import deque from pathlib import Path import pytest @@ -11,10 +12,11 @@ from torch.utils.data import DataLoader from torch.utils.data.graph import traverse_dps from torch.utils.data.graph_settings import get_all_graph_pipes from torchdata.datapipes.iter import ShardingFilter, Shuffler +from torchdata.datapipes.utils import StreamWrapper from torchvision._utils import sequence_to_str -from torchvision.prototype import datasets, transforms +from torchvision.prototype import datasets, features, transforms from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE -from torchvision.prototype.features import Image, Label + assert_samples_equal = functools.partial( assert_equal, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True @@ -25,6 +27,17 @@ def extract_datapipes(dp): return get_all_graph_pipes(traverse_dps(dp)) +def consume(iterator): + # Copied from the official itertools recipes: https://docs.python.org/3/library/itertools.html#itertools-recipes + deque(iterator, maxlen=0) + + +def next_consume(iterator): + item = next(iterator) + consume(iterator) + return item + + @pytest.fixture(autouse=True) def test_home(mocker, tmp_path): mocker.patch("torchvision.prototype.datasets._api.home", return_value=str(tmp_path)) @@ -66,7 +79,7 @@ class TestCommon: dataset, _ = dataset_mock.load(config) try: - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) except StopIteration: raise AssertionError("Unable to draw any sample.") from None except Exception as error: @@ -84,22 +97,53 @@ class TestCommon: assert len(list(dataset)) == mock_info["num_samples"] + @pytest.fixture + def log_session_streams(self): + 
debug_unclosed_streams = StreamWrapper.debug_unclosed_streams + try: + StreamWrapper.debug_unclosed_streams = True + yield + finally: + StreamWrapper.debug_unclosed_streams = debug_unclosed_streams + @parametrize_dataset_mocks(DATASET_MOCKS) - def test_no_vanilla_tensors(self, dataset_mock, config): + def test_stream_closing(self, log_session_streams, dataset_mock, config): + def make_msg_and_close(head): + unclosed_streams = [] + for stream in StreamWrapper.session_streams.keys(): + unclosed_streams.append(repr(stream.file_obj)) + stream.close() + unclosed_streams = "\n".join(unclosed_streams) + return f"{head}\n\n{unclosed_streams}" + + if StreamWrapper.session_streams: + raise pytest.UsageError(make_msg_and_close("A previous test did not close the following streams:")) + dataset, _ = dataset_mock.load(config) - vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor} - if vanilla_tensors: + consume(iter(dataset)) + + if StreamWrapper.session_streams: + raise AssertionError(make_msg_and_close("The following streams were not closed after a full iteration:")) + + @parametrize_dataset_mocks(DATASET_MOCKS) + def test_no_simple_tensors(self, dataset_mock, config): + dataset, _ = dataset_mock.load(config) + + simple_tensors = {key for key, value in next_consume(iter(dataset)).items() if features.is_simple_tensor(value)} + if simple_tensors: raise AssertionError( f"The values of key(s) " - f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors." + f"{sequence_to_str(sorted(simple_tensors), separate_last='and ')} contained simple tensors." ) @parametrize_dataset_mocks(DATASET_MOCKS) def test_transformable(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - next(iter(dataset.map(transforms.Identity()))) + dataset = dataset.map(transforms.Identity()) + + consume(iter(dataset)) @parametrize_dataset_mocks(DATASET_MOCKS) def test_traversable(self, dataset_mock, config): @@ -131,7 +175,7 @@ class TestCommon: collate_fn=self._collate_fn, ) - next(iter(dl)) + consume(dl) # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also # that the Shuffler comes before the ShardingFilter. 
Early commits in https://github.com/pytorch/vision/pull/5680 @@ -148,7 +192,7 @@ class TestCommon: def test_save_load(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) with io.BytesIO() as buffer: torch.save(sample, buffer) @@ -177,7 +221,7 @@ class TestQMNIST: def test_extra_label(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) for key, type in ( ("nist_hsf_series", int), ("nist_writer_id", int), @@ -214,7 +258,7 @@ class TestUSPS: assert "image" in sample assert "label" in sample - assert isinstance(sample["image"], Image) - assert isinstance(sample["label"], Label) + assert isinstance(sample["image"], features.Image) + assert isinstance(sample["label"], features.Label) assert sample["image"].shape == (1, 16, 16) diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index e42657e82..a0a021845 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -30,24 +30,26 @@ class CelebACSVParser(IterDataPipe[Tuple[str, Dict[str, str]]]): def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]: for _, file in self.datapipe: - file = (line.decode() for line in file) + lines = (line.decode() for line in file) if self.fieldnames: fieldnames = self.fieldnames else: # The first row is skipped, because it only contains the number of samples - next(file) + next(lines) # Empty field names are filtered out, because some files have an extra white space after the header # line, which is recognized as extra column - fieldnames = [name for name in next(csv.reader([next(file)], dialect="celeba")) if name] + fieldnames = [name for name in next(csv.reader([next(lines)], dialect="celeba")) if name] # Some files do not include a label for the image ID column if fieldnames[0] != "image_id": fieldnames.insert(0, "image_id") - for line in csv.DictReader(file, fieldnames=fieldnames, dialect="celeba"): + for line in csv.DictReader(lines, fieldnames=fieldnames, dialect="celeba"): yield line.pop("image_id"), line + file.close() + NAME = "celeba" diff --git a/torchvision/prototype/datasets/_builtin/cifar.py b/torchvision/prototype/datasets/_builtin/cifar.py index 26196ded6..0fff2e6a1 100644 --- a/torchvision/prototype/datasets/_builtin/cifar.py +++ b/torchvision/prototype/datasets/_builtin/cifar.py @@ -62,7 +62,9 @@ class _CifarBase(Dataset): def _unpickle(self, data: Tuple[str, io.BytesIO]) -> Dict[str, Any]: _, file = data - return cast(Dict[str, Any], pickle.load(file, encoding="latin1")) + content = cast(Dict[str, Any], pickle.load(file, encoding="latin1")) + file.close() + return content def _prepare_sample(self, data: Tuple[np.ndarray, int]) -> Dict[str, Any]: image_array, category_idx = data diff --git a/torchvision/prototype/datasets/_builtin/clevr.py b/torchvision/prototype/datasets/_builtin/clevr.py index 4ddacdfb9..cb701fbe6 100644 --- a/torchvision/prototype/datasets/_builtin/clevr.py +++ b/torchvision/prototype/datasets/_builtin/clevr.py @@ -97,6 +97,8 @@ class CLEVR(Dataset): buffer_size=INFINITE_BUFFER_SIZE, ) else: + for _, file in scenes_dp: + file.close() dp = Mapper(images_dp, self._add_empty_anns) return Mapper(dp, self._prepare_sample) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 7a459b2d0..c13836a8c 100644 --- 
a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -57,6 +57,8 @@ class MNISTFileReader(IterDataPipe[torch.Tensor]): for _ in range(stop - start): yield read(dtype=dtype, count=count).reshape(shape) + file.close() + class _MNISTBase(Dataset): _URL_BASE: Union[str, Sequence[str]] diff --git a/torchvision/prototype/datasets/_builtin/pcam.py b/torchvision/prototype/datasets/_builtin/pcam.py index 162f22f1a..3a9fe6e90 100644 --- a/torchvision/prototype/datasets/_builtin/pcam.py +++ b/torchvision/prototype/datasets/_builtin/pcam.py @@ -33,6 +33,8 @@ class PCAMH5Reader(IterDataPipe[Tuple[str, io.IOBase]]): data = data[self.key] yield from data + handle.close() + _Resource = namedtuple("_Resource", ("file_name", "gdrive_id", "sha256")) diff --git a/torchvision/prototype/datasets/_builtin/sbd.py b/torchvision/prototype/datasets/_builtin/sbd.py index c7a79c418..7aea1e0f7 100644 --- a/torchvision/prototype/datasets/_builtin/sbd.py +++ b/torchvision/prototype/datasets/_builtin/sbd.py @@ -49,31 +49,35 @@ class SBD(Dataset): super().__init__(root, dependencies=("scipy",), skip_integrity_check=skip_integrity_check) def _resources(self) -> List[OnlineResource]: - archive = HttpResource( - "https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz", - sha256="6a5a2918d5c73ce032fdeba876574d150d9d04113ab87540a1304cbcc715be53", - ) - extra_split = HttpResource( - "http://home.bharathh.info/pubs/codes/SBD/train_noval.txt", - sha256="0b2068f7a359d2907431803e1cd63bf6162da37d7d503b589d3b08c6fd0c2432", - ) - return [archive, extra_split] + resources = [ + HttpResource( + "https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz", + sha256="6a5a2918d5c73ce032fdeba876574d150d9d04113ab87540a1304cbcc715be53", + ) + ] + if self._split == "train_noval": + resources.append( + HttpResource( + "http://home.bharathh.info/pubs/codes/SBD/train_noval.txt", + sha256="0b2068f7a359d2907431803e1cd63bf6162da37d7d503b589d3b08c6fd0c2432", + ) + ) + return resources # type: ignore[return-value] def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: path = pathlib.Path(data[0]) parent, grandparent, *_ = path.parents - if parent.name == "dataset": - return 0 - elif grandparent.name == "dataset": + if grandparent.name == "dataset": if parent.name == "img": - return 1 + return 0 elif parent.name == "cls": - return 2 - else: - return None - else: - return None + return 1 + + if parent.name == "dataset" and self._split != "train_noval": + return 2 + + return None def _prepare_sample(self, data: Tuple[Tuple[Any, Tuple[str, BinaryIO]], Tuple[str, BinaryIO]]) -> Dict[str, Any]: split_and_image_data, ann_data = data @@ -93,18 +97,24 @@ class SBD(Dataset): ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp, extra_split_dp = resource_dps - - archive_dp = resource_dps[0] - split_dp, images_dp, anns_dp = Demultiplexer( - archive_dp, - 3, - self._classify_archive, - buffer_size=INFINITE_BUFFER_SIZE, - drop_none=True, - ) if self._split == "train_noval": - split_dp = extra_split_dp + archive_dp, split_dp = resource_dps + images_dp, anns_dp = Demultiplexer( + archive_dp, + 2, + self._classify_archive, + buffer_size=INFINITE_BUFFER_SIZE, + drop_none=True, + ) + else: + archive_dp = resource_dps[0] + images_dp, anns_dp, split_dp = Demultiplexer( + archive_dp, + 3, + self._classify_archive, + buffer_size=INFINITE_BUFFER_SIZE, + drop_none=True, + ) 
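# Editor's aside (not part of the patch): a minimal, self-contained sketch of the
# Demultiplexer pattern used in the hunk above. The classifier returns an output
# index per item, or None to drop the item when drop_none=True. The paths below
# are made up for illustration and are unrelated to the real SBD archive layout.
from torchdata.datapipes.iter import Demultiplexer, IterableWrapper

toy_entries = IterableWrapper(["dataset/img/a.jpg", "dataset/cls/a.mat", "README.md"])

def toy_classifier(path):
    if "/img/" in path:
        return 0  # routed to the first output datapipe (images)
    if "/cls/" in path:
        return 1  # routed to the second output datapipe (annotations)
    return None  # dropped entirely because drop_none=True

toy_images_dp, toy_anns_dp = Demultiplexer(toy_entries, 2, toy_classifier, drop_none=True, buffer_size=10)
assert list(toy_images_dp) == ["dataset/img/a.jpg"]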
split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt")) split_dp = LineReader(split_dp, decode=True) diff --git a/torchvision/prototype/datasets/_builtin/voc.py b/torchvision/prototype/datasets/_builtin/voc.py index 2f13ce10d..84a9b3a7f 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -94,7 +94,9 @@ class VOC(Dataset): return None def _parse_detection_ann(self, buffer: BinaryIO) -> Dict[str, Any]: - return cast(Dict[str, Any], VOCDetection.parse_voc_xml(ElementTree.parse(buffer).getroot())["annotation"]) + ann = cast(Dict[str, Any], VOCDetection.parse_voc_xml(ElementTree.parse(buffer).getroot())["annotation"]) + buffer.close() + return ann def _prepare_detection_ann(self, buffer: BinaryIO) -> Dict[str, Any]: anns = self._parse_detection_ann(buffer) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 0385d98c2..55f1b8a3f 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -8,7 +8,6 @@ import torch import torch.distributed as dist import torch.utils.data from torchdata.datapipes.iter import IoPathFileLister, IoPathFileOpener, IterDataPipe, ShardingFilter, Shuffler -from torchdata.datapipes.utils import StreamWrapper from torchvision.prototype.utils._internal import fromfile @@ -40,10 +39,9 @@ def read_mat(buffer: BinaryIO, **kwargs: Any) -> Any: except ImportError as error: raise ModuleNotFoundError("Package `scipy` is required to be installed to read .mat files.") from error - if isinstance(buffer, StreamWrapper): - buffer = buffer.file_obj - - return sio.loadmat(buffer, **kwargs) + data = sio.loadmat(buffer, **kwargs) + buffer.close() + return data class MappingIterator(IterDataPipe[Union[Tuple[K, D], D]]): diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index b8b983960..0ec14ab20 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -27,7 +27,9 @@ class EncodedData(_Feature): @classmethod def from_file(cls: Type[D], file: BinaryIO, **kwargs: Any) -> D: - return cls(fromfile(file, dtype=torch.uint8, byte_order=sys.byteorder), **kwargs) + encoded_data = cls(fromfile(file, dtype=torch.uint8, byte_order=sys.byteorder), **kwargs) + file.close() + return encoded_data @classmethod def from_path(cls: Type[D], path: Union[str, os.PathLike], **kwargs: Any) -> D: -- GitLab From 3118fb520d5c8f0d413241104b27848c46c2460e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 7 Oct 2022 15:59:42 +0200 Subject: [PATCH 020/624] add Video feature and kernels (#6667) * add video feature * add video kernels * add video testing utils * add one kernel info * fix kernel names in Video feature * use only uint8 for video testing * require at least 4 dims for Video feature * add TODO for image_size -> spatial_size * image -> video in feature constructor * introduce new combined images and video type * add video to transform utils * fix transforms test * fix auto augment * cleanup * address review comments * add remaining video kernel infos * add batch dimension squashing to some kernels * fix tests and kernel infos * add xfails for arbitrary batch sizes on some kernels * fix test setup * fix equalize_image_tensor for multi batch dims * fix adjust_sharpness_image_tensor for multi batch dims * address review comments --- test/prototype_common_utils.py | 81 ++- 
test/prototype_transforms_dispatcher_infos.py | 20 + test/prototype_transforms_kernel_infos.py | 601 +++++++++++++----- test/test_prototype_transforms.py | 9 +- test/test_prototype_transforms_functional.py | 1 + torchvision/prototype/features/__init__.py | 1 + torchvision/prototype/features/_video.py | 240 +++++++ torchvision/prototype/transforms/_augment.py | 4 +- .../prototype/transforms/_auto_augment.py | 86 +-- torchvision/prototype/transforms/_color.py | 12 +- torchvision/prototype/transforms/_geometry.py | 6 +- torchvision/prototype/transforms/_meta.py | 4 +- torchvision/prototype/transforms/_misc.py | 8 +- torchvision/prototype/transforms/_utils.py | 4 +- .../transforms/functional/__init__.py | 35 +- .../transforms/functional/_augment.py | 14 +- .../prototype/transforms/functional/_color.py | 97 ++- .../transforms/functional/_geometry.py | 137 ++++ .../prototype/transforms/functional/_meta.py | 34 +- .../prototype/transforms/functional/_misc.py | 34 +- 20 files changed, 1171 insertions(+), 257 deletions(-) create mode 100644 torchvision/prototype/features/_video.py diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 333e11fb2..c10cec94c 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -45,6 +45,8 @@ __all__ = [ "make_segmentation_masks", "make_mask_loaders", "make_masks", + "make_video", + "make_videos", ] @@ -210,17 +212,19 @@ DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) def from_loader(loader_fn): def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") loader = loader_fn(*args, **kwargs) - return loader.load(kwargs.get("device", "cpu")) + return loader.load(device) return wrapper def from_loaders(loaders_fn): def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") loaders = loaders_fn(*args, **kwargs) for loader in loaders: - yield loader.load(kwargs.get("device", "cpu")) + yield loader.load(device) return wrapper @@ -246,6 +250,21 @@ class ImageLoader(TensorLoader): self.num_channels = self.shape[-3] +NUM_CHANNELS_MAP = { + features.ColorSpace.GRAY: 1, + features.ColorSpace.GRAY_ALPHA: 2, + features.ColorSpace.RGB: 3, + features.ColorSpace.RGB_ALPHA: 4, +} + + +def get_num_channels(color_space): + num_channels = NUM_CHANNELS_MAP.get(color_space) + if not num_channels: + raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") + return num_channels + + def make_image_loader( size="random", *, @@ -255,16 +274,7 @@ def make_image_loader( constant_alpha=True, ): size = _parse_image_size(size) - - try: - num_channels = { - features.ColorSpace.GRAY: 1, - features.ColorSpace.GRAY_ALPHA: 2, - features.ColorSpace.RGB: 3, - features.ColorSpace.RGB_ALPHA: 4, - }[color_space] - except KeyError as error: - raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") from error + num_channels = get_num_channels(color_space) def fn(shape, dtype, device): max_value = get_max_value(dtype) @@ -531,3 +541,50 @@ def make_mask_loaders( make_masks = from_loaders(make_mask_loaders) + + +class VideoLoader(ImageLoader): + pass + + +def make_video_loader( + size="random", + *, + color_space=features.ColorSpace.RGB, + num_frames="random", + extra_dims=(), + dtype=torch.uint8, +): + size = _parse_image_size(size) + num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames + + def fn(shape, dtype, device): + video = make_image(size=shape[-2:], color_space=color_space, extra_dims=shape[:-3], 
dtype=dtype, device=device) + return features.Video(video, color_space=color_space) + + return VideoLoader( + fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype, color_space=color_space + ) + + +make_video = from_loader(make_video_loader) + + +def make_video_loaders( + *, + sizes=DEFAULT_IMAGE_SIZES, + color_spaces=( + features.ColorSpace.GRAY, + features.ColorSpace.RGB, + ), + num_frames=(1, 0, "random"), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid( + size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes + ): + yield make_video_loader(**params) + + +make_videos = from_loaders(make_video_loaders) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index 9678249aa..be8bd3002 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -127,6 +127,23 @@ xfail_dispatch_pil_if_fill_sequence_needs_broadcast = TestMark( ) +def xfail_all_tests(*, reason, condition): + return [ + TestMark(("TestDispatchers", test_name), pytest.mark.xfail(reason=reason), condition=condition) + for test_name in [ + "test_scripted_smoke", + "test_dispatch_simple_tensor", + "test_dispatch_feature", + ] + ] + + +xfails_degenerate_or_multi_batch_dims = xfail_all_tests( + reason="See https://github.com/pytorch/vision/issues/6670 for details.", + condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), +) + + DISPATCHER_INFOS = [ DispatcherInfo( F.horizontal_flip, @@ -243,6 +260,7 @@ DISPATCHER_INFOS = [ pil_kernel_info=PILKernelInfo(F.perspective_image_pil), test_marks=[ xfail_dispatch_pil_if_fill_sequence_needs_broadcast, + *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( @@ -253,6 +271,7 @@ DISPATCHER_INFOS = [ features.Mask: F.elastic_mask, }, pil_kernel_info=PILKernelInfo(F.elastic_image_pil), + test_marks=xfails_degenerate_or_multi_batch_dims, ), DispatcherInfo( F.center_crop, @@ -275,6 +294,7 @@ DISPATCHER_INFOS = [ test_marks=[ xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), + *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c0e7bf5bf..d90d3bf68 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -20,6 +20,7 @@ from prototype_common_utils import ( make_image_loader, make_image_loaders, make_mask_loaders, + make_video_loaders, VALID_EXTRA_DIMS, ) from torchvision.prototype import features @@ -142,6 +143,25 @@ def xfail_jit_list_of_ints(name, *, reason=None): ) +def xfail_all_tests(*, reason, condition): + return [ + TestMark(("TestKernels", test_name), pytest.mark.xfail(reason=reason), condition=condition) + for test_name in [ + "test_scripted_vs_eager", + "test_batched_vs_single", + "test_no_inplace", + "test_cuda_vs_cpu", + "test_dtype_and_device_consistency", + ] + ] + + +xfails_image_degenerate_or_multi_batch_dims = xfail_all_tests( + reason="See https://github.com/pytorch/vision/issues/6670 for details.", + condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), +) + + KERNEL_INFOS = [] @@ -169,6 +189,11 @@ def sample_inputs_horizontal_flip_mask(): yield ArgsKwargs(image_loader) +def sample_inputs_horizontal_flip_video(): + for video_loader in 
make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -187,6 +212,10 @@ KERNEL_INFOS.extend( F.horizontal_flip_mask, sample_inputs_fn=sample_inputs_horizontal_flip_mask, ), + KernelInfo( + F.horizontal_flip_video, + sample_inputs_fn=sample_inputs_horizontal_flip_video, + ), ] ) @@ -287,6 +316,11 @@ def reference_inputs_resize_mask(): yield ArgsKwargs(mask_loader, size=size) +def sample_inputs_resize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, size=[min(video_loader.shape[-2:]) + 1]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -316,6 +350,10 @@ KERNEL_INFOS.extend( xfail_jit_integer_size(), ], ), + KernelInfo( + F.resize_video, + sample_inputs_fn=sample_inputs_resize_video, + ), ] ) @@ -485,7 +523,7 @@ def reference_inputs_affine_bounding_box(): ) -def sample_inputs_affine_image_mask(): +def sample_inputs_affine_mask(): for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): yield ArgsKwargs(mask_loader, **_full_affine_params()) @@ -502,6 +540,11 @@ def reference_inputs_resize_mask(): yield ArgsKwargs(mask_loader, **affine_kwargs) +def sample_inputs_affine_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, **_full_affine_params()) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -529,7 +572,7 @@ KERNEL_INFOS.extend( ), KernelInfo( F.affine_mask, - sample_inputs_fn=sample_inputs_affine_image_mask, + sample_inputs_fn=sample_inputs_affine_mask, reference_fn=reference_affine_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, @@ -537,6 +580,10 @@ KERNEL_INFOS.extend( xfail_jit_python_scalar_arg("shear"), ], ), + KernelInfo( + F.affine_video, + sample_inputs_fn=sample_inputs_affine_video, + ), ] ) @@ -608,14 +655,28 @@ def reference_inputs_convert_color_space_image_tensor(): yield args_kwargs -KERNEL_INFOS.append( - KernelInfo( - F.convert_color_space_image_tensor, - sample_inputs_fn=sample_inputs_convert_color_space_image_tensor, - reference_fn=reference_convert_color_space_image_tensor, - reference_inputs_fn=reference_inputs_convert_color_space_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), +def sample_inputs_convert_color_space_video(): + color_spaces = [features.ColorSpace.GRAY, features.ColorSpace.RGB] + + for old_color_space, new_color_space in cycle_over(color_spaces): + for video_loader in make_video_loaders(sizes=["random"], color_spaces=[old_color_space], num_frames=["random"]): + yield ArgsKwargs(video_loader, old_color_space=old_color_space, new_color_space=new_color_space) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.convert_color_space_image_tensor, + sample_inputs_fn=sample_inputs_convert_color_space_image_tensor, + reference_fn=reference_convert_color_space_image_tensor, + reference_inputs_fn=reference_inputs_convert_color_space_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.convert_color_space_video, + sample_inputs_fn=sample_inputs_convert_color_space_video, + ), + ] ) @@ -643,6 +704,11 @@ def sample_inputs_vertical_flip_mask(): yield ArgsKwargs(image_loader) +def sample_inputs_vertical_flip_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -661,6 +727,10 @@ 
KERNEL_INFOS.extend( F.vertical_flip_mask, sample_inputs_fn=sample_inputs_vertical_flip_mask, ), + KernelInfo( + F.vertical_flip_video, + sample_inputs_fn=sample_inputs_vertical_flip_video, + ), ] ) @@ -724,6 +794,11 @@ def reference_inputs_rotate_mask(): yield ArgsKwargs(mask_loader, angle=angle) +def sample_inputs_rotate_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, angle=15.0) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -749,6 +824,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_rotate_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.rotate_video, + sample_inputs_fn=sample_inputs_rotate_video, + ), ] ) @@ -791,6 +870,11 @@ def reference_inputs_crop_mask(): yield ArgsKwargs(mask_loader, **params) +def sample_inputs_crop_video(): + for video_loader in make_video_loaders(sizes=[(16, 17)], num_frames=["random"]): + yield ArgsKwargs(video_loader, top=4, left=3, height=7, width=8) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -812,6 +896,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.crop_video, + sample_inputs_fn=sample_inputs_crop_video, + ), ] ) @@ -872,6 +960,11 @@ def reference_inputs_resized_crop_mask(): yield ArgsKwargs(mask_loader, **params) +def sample_inputs_resized_crop_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, **_RESIZED_CROP_PARAMS[0]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -892,6 +985,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_resized_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.resized_crop_video, + sample_inputs_fn=sample_inputs_resized_crop_video, + ), ] ) @@ -965,6 +1062,11 @@ def reference_inputs_pad_mask(): yield ArgsKwargs(image_loader, fill=fill, **params) +def sample_inputs_pad_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, padding=[1]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -996,6 +1098,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_pad_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.pad_video, + sample_inputs_fn=sample_inputs_pad_video, + ), ] ) @@ -1006,11 +1112,7 @@ _PERSPECTIVE_COEFFS = [ def sample_inputs_perspective_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for image_loader in make_image_loaders(sizes=["random"]): for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, fill=fill, perspective_coeffs=_PERSPECTIVE_COEFFS[0]) @@ -1030,11 +1132,7 @@ def sample_inputs_perspective_bounding_box(): def sample_inputs_perspective_mask(): - for mask_loader in make_mask_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for mask_loader in make_mask_loaders(sizes=["random"]): yield ArgsKwargs(mask_loader, perspective_coeffs=_PERSPECTIVE_COEFFS[0]) @@ -1045,6 +1143,11 @@ def reference_inputs_perspective_mask(): yield ArgsKwargs(mask_loader, perspective_coeffs=perspective_coeffs) +def sample_inputs_perspective_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, 
perspective_coeffs=_PERSPECTIVE_COEFFS[0]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -1053,6 +1156,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.perspective_bounding_box, @@ -1064,6 +1168,11 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, + ), + KernelInfo( + F.perspective_video, + sample_inputs_fn=sample_inputs_perspective_video, ), ] ) @@ -1074,11 +1183,7 @@ def _get_elastic_displacement(image_size): def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for image_loader in make_image_loaders(sizes=["random"]): displacement = _get_elastic_displacement(image_loader.image_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -1109,11 +1214,7 @@ def sample_inputs_elastic_bounding_box(): def sample_inputs_elastic_mask(): - for mask_loader in make_mask_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for mask_loader in make_mask_loaders(sizes=["random"]): displacement = _get_elastic_displacement(mask_loader.shape[-2:]) yield ArgsKwargs(mask_loader, displacement=displacement) @@ -1124,6 +1225,12 @@ def reference_inputs_elastic_mask(): yield ArgsKwargs(mask_loader, displacement=displacement) +def sample_inputs_elastic_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + displacement = _get_elastic_displacement(video_loader.shape[-2:]) + yield ArgsKwargs(video_loader, displacement=displacement) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -1132,6 +1239,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.elastic_bounding_box, @@ -1143,6 +1251,11 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, + ), + KernelInfo( + F.elastic_video, + sample_inputs_fn=sample_inputs_elastic_video, ), ] ) @@ -1195,6 +1308,12 @@ def reference_inputs_center_crop_mask(): yield ArgsKwargs(mask_loader, output_size=output_size) +def sample_inputs_center_crop_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + height, width = video_loader.shape[-2:] + yield ArgsKwargs(video_loader, output_size=(height // 2, width // 2)) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -1224,17 +1343,17 @@ KERNEL_INFOS.extend( xfail_jit_integer_size("output_size"), ], ), + KernelInfo( + F.center_crop_video, + sample_inputs_fn=sample_inputs_center_crop_video, + ), ] ) def sample_inputs_gaussian_blur_image_tensor(): make_gaussian_blur_image_loaders = functools.partial( - make_image_loaders, - sizes=["random"], - 
color_spaces=[features.ColorSpace.RGB], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB] ) for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): @@ -1246,26 +1365,34 @@ def sample_inputs_gaussian_blur_image_tensor(): yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma) -KERNEL_INFOS.append( - KernelInfo( - F.gaussian_blur_image_tensor, - sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=[ - xfail_jit_python_scalar_arg("kernel_size"), - xfail_jit_python_scalar_arg("sigma"), - ], - ) +def sample_inputs_gaussian_blur_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, kernel_size=[3, 3]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.gaussian_blur_image_tensor, + sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=[ + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), + *xfails_image_degenerate_or_multi_batch_dims, + ], + ), + KernelInfo( + F.gaussian_blur_video, + sample_inputs_fn=sample_inputs_gaussian_blur_video, + ), + ] ) def sample_inputs_equalize_image_tensor(): for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), - dtypes=[torch.uint8], + sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), dtypes=[torch.uint8] ): yield ArgsKwargs(image_loader) @@ -1277,15 +1404,26 @@ def reference_inputs_equalize_image_tensor(): yield ArgsKwargs(image_loader) -KERNEL_INFOS.append( - KernelInfo( - F.equalize_image_tensor, - kernel_name="equalize_image_tensor", - sample_inputs_fn=sample_inputs_equalize_image_tensor, - reference_fn=pil_reference_wrapper(F.equalize_image_pil), - reference_inputs_fn=reference_inputs_equalize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_equalize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.equalize_image_tensor, + kernel_name="equalize_image_tensor", + sample_inputs_fn=sample_inputs_equalize_image_tensor, + reference_fn=pil_reference_wrapper(F.equalize_image_pil), + reference_inputs_fn=reference_inputs_equalize_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.equalize_video, + sample_inputs_fn=sample_inputs_equalize_video, + ), + ] ) @@ -1303,15 +1441,26 @@ def reference_inputs_invert_image_tensor(): yield ArgsKwargs(image_loader) -KERNEL_INFOS.append( - KernelInfo( - F.invert_image_tensor, - kernel_name="invert_image_tensor", - sample_inputs_fn=sample_inputs_invert_image_tensor, - reference_fn=pil_reference_wrapper(F.invert_image_pil), - reference_inputs_fn=reference_inputs_invert_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_invert_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.invert_image_tensor, + kernel_name="invert_image_tensor", + sample_inputs_fn=sample_inputs_invert_image_tensor, + 
reference_fn=pil_reference_wrapper(F.invert_image_pil), + reference_inputs_fn=reference_inputs_invert_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.invert_video, + sample_inputs_fn=sample_inputs_invert_video, + ), + ] ) @@ -1335,15 +1484,26 @@ def reference_inputs_posterize_image_tensor(): yield ArgsKwargs(image_loader, bits=bits) -KERNEL_INFOS.append( - KernelInfo( - F.posterize_image_tensor, - kernel_name="posterize_image_tensor", - sample_inputs_fn=sample_inputs_posterize_image_tensor, - reference_fn=pil_reference_wrapper(F.posterize_image_pil), - reference_inputs_fn=reference_inputs_posterize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_posterize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.posterize_image_tensor, + kernel_name="posterize_image_tensor", + sample_inputs_fn=sample_inputs_posterize_image_tensor, + reference_fn=pil_reference_wrapper(F.posterize_image_pil), + reference_inputs_fn=reference_inputs_posterize_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.posterize_video, + sample_inputs_fn=sample_inputs_posterize_video, + ), + ] ) @@ -1368,15 +1528,26 @@ def reference_inputs_solarize_image_tensor(): yield ArgsKwargs(image_loader, threshold=threshold) -KERNEL_INFOS.append( - KernelInfo( - F.solarize_image_tensor, - kernel_name="solarize_image_tensor", - sample_inputs_fn=sample_inputs_solarize_image_tensor, - reference_fn=pil_reference_wrapper(F.solarize_image_pil), - reference_inputs_fn=reference_inputs_solarize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_solarize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, threshold=next(_get_solarize_thresholds(video_loader.dtype))) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.solarize_image_tensor, + kernel_name="solarize_image_tensor", + sample_inputs_fn=sample_inputs_solarize_image_tensor, + reference_fn=pil_reference_wrapper(F.solarize_image_pil), + reference_inputs_fn=reference_inputs_solarize_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.solarize_video, + sample_inputs_fn=sample_inputs_solarize_video, + ), + ] ) @@ -1394,15 +1565,26 @@ def reference_inputs_autocontrast_image_tensor(): yield ArgsKwargs(image_loader) -KERNEL_INFOS.append( - KernelInfo( - F.autocontrast_image_tensor, - kernel_name="autocontrast_image_tensor", - sample_inputs_fn=sample_inputs_autocontrast_image_tensor, - reference_fn=pil_reference_wrapper(F.autocontrast_image_pil), - reference_inputs_fn=reference_inputs_autocontrast_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_autocontrast_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.autocontrast_image_tensor, + kernel_name="autocontrast_image_tensor", + sample_inputs_fn=sample_inputs_autocontrast_image_tensor, + reference_fn=pil_reference_wrapper(F.autocontrast_image_pil), + reference_inputs_fn=reference_inputs_autocontrast_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.autocontrast_video, + sample_inputs_fn=sample_inputs_autocontrast_video, + ), + ] ) 
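# --- Editor's note: an illustrative sketch, not part of the patch. Every video kernel in
# this file is registered with the same two-piece recipe: a `sample_inputs_*_video`
# generator that yields ArgsKwargs built from `make_video_loaders` (whose loaders have
# shape (*extra_dims, num_frames, C, H, W)), and a KernelInfo entry tying the kernel to
# that generator. The snippet below spells the recipe out once, reusing the
# already-registered F.invert_video and a throwaway list so nothing is registered twice.


def _sketch_sample_inputs_invert_video():
    # "random" keeps the spatial size and frame count small but non-degenerate.
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


_SKETCH_KERNEL_INFOS = [
    KernelInfo(
        F.invert_video,
        sample_inputs_fn=_sketch_sample_inputs_invert_video,
    )
]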
_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5] @@ -1412,8 +1594,6 @@ def sample_inputs_adjust_sharpness_image_tensor(): for image_loader in make_image_loaders( sizes=["random", (2, 2)], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], ): yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) @@ -1426,15 +1606,26 @@ def reference_inputs_adjust_sharpness_image_tensor(): yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_sharpness_image_tensor, - kernel_name="adjust_sharpness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_sharpness_image_pil), - reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_sharpness_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_sharpness_image_tensor, + kernel_name="adjust_sharpness_image_tensor", + sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_sharpness_image_pil), + reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_sharpness_video, + sample_inputs_fn=sample_inputs_adjust_sharpness_video, + ), + ] ) @@ -1446,12 +1637,26 @@ def sample_inputs_erase_image_tensor(): yield ArgsKwargs(image_loader, i=1, j=2, h=h, w=w, v=v) -KERNEL_INFOS.append( - KernelInfo( - F.erase_image_tensor, - kernel_name="erase_image_tensor", - sample_inputs_fn=sample_inputs_erase_image_tensor, - ) +def sample_inputs_erase_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + # FIXME: make the parameters more diverse + h, w = 6, 7 + v = torch.rand(video_loader.num_channels, h, w) + yield ArgsKwargs(video_loader, i=1, j=2, h=h, w=w, v=v) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.erase_image_tensor, + kernel_name="erase_image_tensor", + sample_inputs_fn=sample_inputs_erase_image_tensor, + ), + KernelInfo( + F.erase_video, + sample_inputs_fn=sample_inputs_erase_video, + ), + ] ) _ADJUST_BRIGHTNESS_FACTORS = [0.1, 0.5] @@ -1472,15 +1677,26 @@ def reference_inputs_adjust_brightness_image_tensor(): yield ArgsKwargs(image_loader, brightness_factor=brightness_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_brightness_image_tensor, - kernel_name="adjust_brightness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_brightness_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_brightness_image_pil), - reference_inputs_fn=reference_inputs_adjust_brightness_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_brightness_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_brightness_image_tensor, + kernel_name="adjust_brightness_image_tensor", + sample_inputs_fn=sample_inputs_adjust_brightness_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_brightness_image_pil), + reference_inputs_fn=reference_inputs_adjust_brightness_image_tensor, + 
closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_brightness_video, + sample_inputs_fn=sample_inputs_adjust_brightness_video, + ), + ] ) @@ -1502,15 +1718,26 @@ def reference_inputs_adjust_contrast_image_tensor(): yield ArgsKwargs(image_loader, contrast_factor=contrast_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_contrast_image_tensor, - kernel_name="adjust_contrast_image_tensor", - sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_contrast_image_pil), - reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_contrast_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_contrast_image_tensor, + kernel_name="adjust_contrast_image_tensor", + sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_contrast_image_pil), + reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_contrast_video, + sample_inputs_fn=sample_inputs_adjust_contrast_video, + ), + ] ) _ADJUST_GAMMA_GAMMAS_GAINS = [ @@ -1535,15 +1762,27 @@ def reference_inputs_adjust_gamma_image_tensor(): yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_gamma_image_tensor, - kernel_name="adjust_gamma_image_tensor", - sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_gamma_image_pil), - reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_gamma_video(): + gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, gamma=gamma, gain=gain) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_gamma_image_tensor, + kernel_name="adjust_gamma_image_tensor", + sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_gamma_image_pil), + reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_gamma_video, + sample_inputs_fn=sample_inputs_adjust_gamma_video, + ), + ] ) @@ -1565,15 +1804,26 @@ def reference_inputs_adjust_hue_image_tensor(): yield ArgsKwargs(image_loader, hue_factor=hue_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_hue_image_tensor, - kernel_name="adjust_hue_image_tensor", - sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_hue_image_pil), - reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_hue_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_hue_image_tensor, + kernel_name="adjust_hue_image_tensor", + sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_hue_image_pil), + reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, + 
closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_hue_video, + sample_inputs_fn=sample_inputs_adjust_hue_video, + ), + ] ) _ADJUST_SATURATION_FACTORS = [0.1, 0.5] @@ -1594,15 +1844,26 @@ def reference_inputs_adjust_saturation_image_tensor(): yield ArgsKwargs(image_loader, saturation_factor=saturation_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_saturation_image_tensor, - kernel_name="adjust_saturation_image_tensor", - sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_saturation_image_pil), - reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_saturation_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_saturation_image_tensor, + kernel_name="adjust_saturation_image_tensor", + sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_saturation_image_pil), + reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_saturation_video, + sample_inputs_fn=sample_inputs_adjust_saturation_video, + ), + ] ) @@ -1702,10 +1963,24 @@ def sample_inputs_normalize_image_tensor(): yield ArgsKwargs(image_loader, mean=mean, std=std) -KERNEL_INFOS.append( - KernelInfo( - F.normalize_image_tensor, - kernel_name="normalize_image_tensor", - sample_inputs_fn=sample_inputs_normalize_image_tensor, - ) +def sample_inputs_normalize_video(): + mean, std = _NORMALIZE_MEANS_STDS[0] + for video_loader in make_video_loaders( + sizes=["random"], color_spaces=[features.ColorSpace.RGB], num_frames=["random"], dtypes=[torch.float32] + ): + yield ArgsKwargs(video_loader, mean=mean, std=std) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.normalize_image_tensor, + kernel_name="normalize_image_tensor", + sample_inputs_fn=sample_inputs_normalize_image_tensor, + ), + KernelInfo( + F.normalize_video, + sample_inputs_fn=sample_inputs_normalize_video, + ), + ] ) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 9734a5dc3..916861f4e 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -17,6 +17,7 @@ from prototype_common_utils import ( make_masks, make_one_hot_labels, make_segmentation_mask, + make_videos, ) from torchvision.ops.boxes import box_iou from torchvision.prototype import features, transforms @@ -65,6 +66,7 @@ def parametrize_from_transforms(*transforms): make_vanilla_tensor_images, make_pil_images, make_masks, + make_videos, ]: inputs = list(creation_fn()) try: @@ -155,12 +157,14 @@ class TestSmoke: features.ColorSpace.RGB, ], dtypes=[torch.uint8], - extra_dims=[(4,)], + extra_dims=[(), (4,)], + **(dict(num_frames=["random"]) if fn is make_videos else dict()), ) for fn in [ make_images, make_vanilla_tensor_images, make_pil_images, + make_videos, ] ), ) @@ -184,6 +188,7 @@ class TestSmoke: for fn in [ make_images, make_vanilla_tensor_images, + make_videos, ] ), ), @@ -200,6 +205,7 @@ class TestSmoke: make_images(extra_dims=[(4,)]), make_vanilla_tensor_images(), make_pil_images(), + make_videos(extra_dims=[()]), ), ) ] @@ -218,6 +224,7 @@ class TestSmoke: make_images, make_vanilla_tensor_images, make_pil_images, + make_videos, ) ] ), diff 
--git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index a6523045c..5adea4d26 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -129,6 +129,7 @@ class TestKernels: # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as # common ground. features.Mask: 2, + features.Video: 4, }.get(feature_type) if data_dims is None: raise pytest.UsageError( diff --git a/torchvision/prototype/features/__init__.py b/torchvision/prototype/features/__init__.py index df77e8b77..6fc2fb6ea 100644 --- a/torchvision/prototype/features/__init__.py +++ b/torchvision/prototype/features/__init__.py @@ -13,3 +13,4 @@ from ._image import ( ) from ._label import Label, OneHotLabel from ._mask import Mask +from ._video import ImageOrVideoType, ImageOrVideoTypeJIT, TensorImageOrVideoType, TensorImageOrVideoTypeJIT, Video diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py new file mode 100644 index 000000000..e19b6f7ed --- /dev/null +++ b/torchvision/prototype/features/_video.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import warnings +from typing import Any, cast, List, Optional, Tuple, Union + +import torch +from torchvision.transforms.functional import InterpolationMode + +from ._feature import _Feature, FillTypeJIT +from ._image import ColorSpace, ImageType, ImageTypeJIT, TensorImageType, TensorImageTypeJIT + + +class Video(_Feature): + color_space: ColorSpace + + def __new__( + cls, + data: Any, + *, + color_space: Optional[Union[ColorSpace, str]] = None, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str, int]] = None, + requires_grad: bool = False, + ) -> Video: + data = torch.as_tensor(data, dtype=dtype, device=device) + if data.ndim < 4: + raise ValueError + video = super().__new__(cls, data, requires_grad=requires_grad) + + if color_space is None: + color_space = ColorSpace.from_tensor_shape(video.shape) # type: ignore[arg-type] + if color_space == ColorSpace.OTHER: + warnings.warn("Unable to guess a specific color space. 
Consider passing it explicitly.") + elif isinstance(color_space, str): + color_space = ColorSpace.from_str(color_space.upper()) + elif not isinstance(color_space, ColorSpace): + raise ValueError + video.color_space = color_space + + return video + + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(color_space=self.color_space) + + @classmethod + def new_like( + cls, other: Video, data: Any, *, color_space: Optional[Union[ColorSpace, str]] = None, **kwargs: Any + ) -> Video: + return super().new_like( + other, data, color_space=color_space if color_space is not None else other.color_space, **kwargs + ) + + # TODO: rename this (and all instances of this term to spatial size) + @property + def image_size(self) -> Tuple[int, int]: + return cast(Tuple[int, int], tuple(self.shape[-2:])) + + @property + def num_channels(self) -> int: + return self.shape[-3] + + @property + def num_frames(self) -> int: + return self.shape[-4] + + def to_color_space(self, color_space: Union[str, ColorSpace], copy: bool = True) -> Video: + if isinstance(color_space, str): + color_space = ColorSpace.from_str(color_space.upper()) + + return Video.new_like( + self, + self._F.convert_color_space_video( + self, old_color_space=self.color_space, new_color_space=color_space, copy=copy + ), + color_space=color_space, + ) + + def horizontal_flip(self) -> Video: + output = self._F.horizontal_flip_video(self) + return Video.new_like(self, output) + + def vertical_flip(self) -> Video: + output = self._F.vertical_flip_video(self) + return Video.new_like(self, output) + + def resize( # type: ignore[override] + self, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + max_size: Optional[int] = None, + antialias: bool = False, + ) -> Video: + output = self._F.resize_video(self, size, interpolation=interpolation, max_size=max_size, antialias=antialias) + return Video.new_like(self, output) + + def crop(self, top: int, left: int, height: int, width: int) -> Video: + output = self._F.crop_video(self, top, left, height, width) + return Video.new_like(self, output) + + def center_crop(self, output_size: List[int]) -> Video: + output = self._F.center_crop_video(self, output_size=output_size) + return Video.new_like(self, output) + + def resized_crop( + self, + top: int, + left: int, + height: int, + width: int, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias: bool = False, + ) -> Video: + output = self._F.resized_crop_video( + self, top, left, height, width, size=list(size), interpolation=interpolation, antialias=antialias + ) + return Video.new_like(self, output) + + def pad( + self, + padding: Union[int, List[int]], + fill: FillTypeJIT = None, + padding_mode: str = "constant", + ) -> Video: + output = self._F.pad_video(self, padding, fill=fill, padding_mode=padding_mode) + return Video.new_like(self, output) + + def rotate( + self, + angle: float, + interpolation: InterpolationMode = InterpolationMode.NEAREST, + expand: bool = False, + fill: FillTypeJIT = None, + center: Optional[List[float]] = None, + ) -> Video: + output = self._F._geometry.rotate_video( + self, angle, interpolation=interpolation, expand=expand, fill=fill, center=center + ) + return Video.new_like(self, output) + + def affine( + self, + angle: Union[int, float], + translate: List[float], + scale: float, + shear: List[float], + interpolation: InterpolationMode = InterpolationMode.NEAREST, + fill: FillTypeJIT = None, + center: 
Optional[List[float]] = None, + ) -> Video: + output = self._F._geometry.affine_video( + self, + angle, + translate=translate, + scale=scale, + shear=shear, + interpolation=interpolation, + fill=fill, + center=center, + ) + return Video.new_like(self, output) + + def perspective( + self, + perspective_coeffs: List[float], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: FillTypeJIT = None, + ) -> Video: + output = self._F._geometry.perspective_video(self, perspective_coeffs, interpolation=interpolation, fill=fill) + return Video.new_like(self, output) + + def elastic( + self, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: FillTypeJIT = None, + ) -> Video: + output = self._F._geometry.elastic_video(self, displacement, interpolation=interpolation, fill=fill) + return Video.new_like(self, output) + + def adjust_brightness(self, brightness_factor: float) -> Video: + output = self._F.adjust_brightness_video(self, brightness_factor=brightness_factor) + return Video.new_like(self, output) + + def adjust_saturation(self, saturation_factor: float) -> Video: + output = self._F.adjust_saturation_video(self, saturation_factor=saturation_factor) + return Video.new_like(self, output) + + def adjust_contrast(self, contrast_factor: float) -> Video: + output = self._F.adjust_contrast_video(self, contrast_factor=contrast_factor) + return Video.new_like(self, output) + + def adjust_sharpness(self, sharpness_factor: float) -> Video: + output = self._F.adjust_sharpness_video(self, sharpness_factor=sharpness_factor) + return Video.new_like(self, output) + + def adjust_hue(self, hue_factor: float) -> Video: + output = self._F.adjust_hue_video(self, hue_factor=hue_factor) + return Video.new_like(self, output) + + def adjust_gamma(self, gamma: float, gain: float = 1) -> Video: + output = self._F.adjust_gamma_video(self, gamma=gamma, gain=gain) + return Video.new_like(self, output) + + def posterize(self, bits: int) -> Video: + output = self._F.posterize_video(self, bits=bits) + return Video.new_like(self, output) + + def solarize(self, threshold: float) -> Video: + output = self._F.solarize_video(self, threshold=threshold) + return Video.new_like(self, output) + + def autocontrast(self) -> Video: + output = self._F.autocontrast_video(self) + return Video.new_like(self, output) + + def equalize(self) -> Video: + output = self._F.equalize_video(self) + return Video.new_like(self, output) + + def invert(self) -> Video: + output = self._F.invert_video(self) + return Video.new_like(self, output) + + def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Video: + output = self._F.gaussian_blur_video(self, kernel_size=kernel_size, sigma=sigma) + return Video.new_like(self, output) + + +VideoType = Union[torch.Tensor, Video] +VideoTypeJIT = torch.Tensor +LegacyVideoType = torch.Tensor +LegacyVideoTypeJIT = torch.Tensor +TensorVideoType = Union[torch.Tensor, Video] +TensorVideoTypeJIT = torch.Tensor + +ImageOrVideoType = Union[ImageType, VideoType] +ImageOrVideoTypeJIT = Union[ImageTypeJIT, VideoTypeJIT] +TensorImageOrVideoType = Union[TensorImageType, TensorVideoType] +TensorImageOrVideoTypeJIT = Union[TensorImageTypeJIT, TensorVideoTypeJIT] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 3cd925fd9..311ad6d5a 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -15,7 +15,7 @@ from ._utils 
import has_any, query_chw class RandomErasing(_RandomApplyTransform): - _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image) + _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image, features.Video) def __init__( self, @@ -92,7 +92,7 @@ class RandomErasing(_RandomApplyTransform): return dict(i=i, j=j, h=h, w=w, v=v) - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: if params["v"] is not None: inpt = F.erase(inpt, **params, inplace=self.inplace) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index c98e5c36e..4732f88d4 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -31,40 +31,41 @@ class _AutoAugmentBase(Transform): key = keys[int(torch.randint(len(keys), ()))] return key, dct[key] - def _extract_image( + def _extract_image_or_video( self, sample: Any, unsupported_types: Tuple[Type, ...] = (features.BoundingBox, features.Mask), - ) -> Tuple[int, features.ImageType]: + ) -> Tuple[int, features.ImageOrVideoType]: sample_flat, _ = tree_flatten(sample) - images = [] + image_or_videos = [] for id, inpt in enumerate(sample_flat): - if _isinstance(inpt, (features.Image, PIL.Image.Image, features.is_simple_tensor)): - images.append((id, inpt)) + if _isinstance(inpt, (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video)): + image_or_videos.append((id, inpt)) elif isinstance(inpt, unsupported_types): raise TypeError(f"Inputs of type {type(inpt).__name__} are not supported by {type(self).__name__}()") - if not images: + if not image_or_videos: raise TypeError("Found no image in the sample.") - if len(images) > 1: + if len(image_or_videos) > 1: raise TypeError( - f"Auto augment transformations are only properly defined for a single image, but found {len(images)}." + f"Auto augment transformations are only properly defined for a single image or video, " + f"but found {len(image_or_videos)}." 
) - return images[0] + return image_or_videos[0] def _put_into_sample(self, sample: Any, id: int, item: Any) -> Any: sample_flat, spec = tree_flatten(sample) sample_flat[id] = item return tree_unflatten(sample_flat, spec) - def _apply_image_transform( + def _apply_image_or_video_transform( self, - image: features.ImageType, + image: features.ImageOrVideoType, transform_id: str, magnitude: float, interpolation: InterpolationMode, fill: Dict[Type, features.FillType], - ) -> features.ImageType: + ) -> features.ImageOrVideoType: fill_ = fill[type(image)] fill_ = F._geometry._convert_fill_arg(fill_) @@ -276,8 +277,8 @@ class AutoAugment(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, image = self._extract_image(sample) - _, height, width = get_chw(image) + id, image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -295,11 +296,11 @@ class AutoAugment(_AutoAugmentBase): else: magnitude = 0.0 - image = self._apply_image_transform( - image, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill + image_or_video = self._apply_image_or_video_transform( + image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image) + return self._put_into_sample(sample, id, image_or_video) class RandAugment(_AutoAugmentBase): @@ -347,8 +348,8 @@ class RandAugment(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, image = self._extract_image(sample) - _, height, width = get_chw(image) + id, image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(image_or_video) for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -359,11 +360,11 @@ class RandAugment(_AutoAugmentBase): magnitude *= -1 else: magnitude = 0.0 - image = self._apply_image_transform( - image, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill + image_or_video = self._apply_image_or_video_transform( + image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image) + return self._put_into_sample(sample, id, image_or_video) class TrivialAugmentWide(_AutoAugmentBase): @@ -401,8 +402,8 @@ class TrivialAugmentWide(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, image = self._extract_image(sample) - _, height, width = get_chw(image) + id, image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(image_or_video) transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -414,10 +415,10 @@ class TrivialAugmentWide(_AutoAugmentBase): else: magnitude = 0.0 - image = self._apply_image_transform( - image, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill + image_or_video = self._apply_image_or_video_transform( + image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image) + return self._put_into_sample(sample, id, image_or_video) class AugMix(_AutoAugmentBase): @@ -471,27 +472,28 @@ class AugMix(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, 
orig_image = self._extract_image(sample) - _, height, width = get_chw(orig_image) + id, orig_image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(orig_image_or_video) - if isinstance(orig_image, torch.Tensor): - image = orig_image + if isinstance(orig_image_or_video, torch.Tensor): + image_or_video = orig_image_or_video else: # isinstance(inpt, PIL.Image.Image): - image = F.pil_to_tensor(orig_image) + image_or_video = F.pil_to_tensor(orig_image_or_video) augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE - orig_dims = list(image.shape) - batch = image.view([1] * max(4 - image.ndim, 0) + orig_dims) + orig_dims = list(image_or_video.shape) + batch = image_or_video.view([1] * max(4 - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) - # Sample the beta weights for combining the original and augmented image. To get Beta, we use a Dirichlet - # with 2 parameters. The 1st column stores the weights of the original and the 2nd the ones of augmented image. + # Sample the beta weights for combining the original and augmented image or video. To get Beta, we use a + # Dirichlet with 2 parameters. The 1st column stores the weights of the original and the 2nd the ones of + # augmented image or video. m = self._sample_dirichlet( torch.tensor([self.alpha, self.alpha], device=batch.device).expand(batch_dims[0], -1) ) - # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images. + # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images or videos. combined_weights = self._sample_dirichlet( torch.tensor([self.alpha] * self.mixture_width, device=batch.device).expand(batch_dims[0], -1) ) * m[:, 1].view([batch_dims[0], -1]) @@ -511,15 +513,15 @@ class AugMix(_AutoAugmentBase): else: magnitude = 0.0 - aug = self._apply_image_transform( + aug = self._apply_image_or_video_transform( aug, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) mix.add_(combined_weights[:, i].view(batch_dims) * aug) - mix = mix.view(orig_dims).to(dtype=image.dtype) + mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) - if isinstance(orig_image, features.Image): - mix = features.Image.new_like(orig_image, mix) - elif isinstance(orig_image, PIL.Image.Image): + if isinstance(orig_image_or_video, (features.Image, features.Video)): + mix = type(orig_image_or_video).new_like(orig_image_or_video, mix) # type: ignore[arg-type] + elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) return self._put_into_sample(sample, id, mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index e0ee8d1b9..451b57b66 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -82,7 +82,7 @@ class ColorJitter(Transform): class RandomPhotometricDistort(Transform): - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__( self, @@ -110,20 +110,22 @@ class RandomPhotometricDistort(Transform): channel_permutation=torch.randperm(num_channels) if torch.rand(()) < self.p else None, ) - def _permute_channels(self, inpt: features.ImageType, permutation: torch.Tensor) -> features.ImageType: + def _permute_channels( + self, inpt: features.ImageOrVideoType, permutation: 
torch.Tensor + ) -> features.ImageOrVideoType: if isinstance(inpt, PIL.Image.Image): inpt = F.pil_to_tensor(inpt) output = inpt[..., permutation, :, :] - if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output, color_space=features.ColorSpace.OTHER) + if isinstance(inpt, (features.Image, features.Video)): + output = type(inpt).new_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] elif isinstance(inpt, PIL.Image.Image): output = F.to_image_pil(output) return output - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: if params["brightness"]: inpt = F.adjust_brightness( inpt, brightness_factor=ColorJitter._generate_value(self.brightness[0], self.brightness[1]) diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 008d4d195..1f132ec92 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -855,8 +855,10 @@ class FixedSizeCrop(Transform): return inpt def forward(self, *inputs: Any) -> Any: - if not has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor): - raise TypeError(f"{type(self).__name__}() requires input sample to contain an tensor or PIL image.") + if not has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." + ) if has_any(inputs, features.BoundingBox) and not has_any(inputs, features.Label, features.OneHotLabel): raise TypeError( diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index 2ea3014aa..cb090492a 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -34,7 +34,7 @@ class ConvertImageDtype(Transform): class ConvertColorSpace(Transform): - _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image) + _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image, features.Video) def __init__( self, @@ -54,7 +54,7 @@ class ConvertColorSpace(Transform): self.copy = copy - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: return F.convert_color_space( inpt, color_space=self.color_space, old_color_space=self.old_color_space, copy=self.copy ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 976e9f8b5..2531bf8f6 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -38,7 +38,7 @@ class Lambda(Transform): class LinearTransformation(Transform): - _transformed_types = (features.is_simple_tensor, features.Image) + _transformed_types = (features.is_simple_tensor, features.Image, features.Video) def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tensor): super().__init__() @@ -68,7 +68,7 @@ class LinearTransformation(Transform): return super().forward(*inputs) - def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> torch.Tensor: + def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: # Image instance 
after linear transformation is not Image anymore due to unknown data range # Thus we will return Tensor for input Image @@ -93,7 +93,7 @@ class LinearTransformation(Transform): class Normalize(Transform): - _transformed_types = (features.Image, features.is_simple_tensor) + _transformed_types = (features.Image, features.is_simple_tensor, features.Video) def __init__(self, mean: Sequence[float], std: Sequence[float], inplace: bool = False): super().__init__() @@ -101,7 +101,7 @@ class Normalize(Transform): self.std = list(std) self.inplace = inplace - def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> torch.Tensor: + def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) def forward(self, *inpts: Any) -> Any: diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index 219e6e505..a76891a34 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -82,10 +82,10 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: chws = { get_chw(item) for item in flat_sample - if isinstance(item, (features.Image, PIL.Image.Image)) or features.is_simple_tensor(item) + if isinstance(item, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(item) } if not chws: - raise TypeError("No image was found in the sample") + raise TypeError("No image or video was found in the sample") elif len(chws) > 1: raise ValueError(f"Found multiple CxHxW dimensions in the sample: {sequence_to_str(sorted(chws))}") return chws.pop() diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index f081d101d..cb801df73 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -6,6 +6,7 @@ from ._meta import ( convert_format_bounding_box, convert_color_space_image_tensor, convert_color_space_image_pil, + convert_color_space_video, convert_color_space, get_dimensions, get_image_num_channels, @@ -13,41 +14,52 @@ from ._meta import ( get_spatial_size, ) # usort: skip -from ._augment import erase, erase_image_pil, erase_image_tensor +from ._augment import erase, erase_image_pil, erase_image_tensor, erase_video from ._color import ( adjust_brightness, adjust_brightness_image_pil, adjust_brightness_image_tensor, + adjust_brightness_video, adjust_contrast, adjust_contrast_image_pil, adjust_contrast_image_tensor, + adjust_contrast_video, adjust_gamma, adjust_gamma_image_pil, adjust_gamma_image_tensor, + adjust_gamma_video, adjust_hue, adjust_hue_image_pil, adjust_hue_image_tensor, + adjust_hue_video, adjust_saturation, adjust_saturation_image_pil, adjust_saturation_image_tensor, + adjust_saturation_video, adjust_sharpness, adjust_sharpness_image_pil, adjust_sharpness_image_tensor, + adjust_sharpness_video, autocontrast, autocontrast_image_pil, autocontrast_image_tensor, + autocontrast_video, equalize, equalize_image_pil, equalize_image_tensor, + equalize_video, invert, invert_image_pil, invert_image_tensor, + invert_video, posterize, posterize_image_pil, posterize_image_tensor, + posterize_video, solarize, solarize_image_pil, solarize_image_tensor, + solarize_video, ) from ._geometry import ( affine, @@ -55,22 +67,26 @@ from ._geometry import ( affine_image_pil, affine_image_tensor, affine_mask, + affine_video, center_crop, 
center_crop_bounding_box, center_crop_image_pil, center_crop_image_tensor, center_crop_mask, + center_crop_video, crop, crop_bounding_box, crop_image_pil, crop_image_tensor, crop_mask, + crop_video, elastic, elastic_bounding_box, elastic_image_pil, elastic_image_tensor, elastic_mask, elastic_transform, + elastic_video, five_crop, five_crop_image_pil, five_crop_image_tensor, @@ -80,31 +96,37 @@ from ._geometry import ( horizontal_flip_image_pil, horizontal_flip_image_tensor, horizontal_flip_mask, + horizontal_flip_video, pad, pad_bounding_box, pad_image_pil, pad_image_tensor, pad_mask, + pad_video, perspective, perspective_bounding_box, perspective_image_pil, perspective_image_tensor, perspective_mask, + perspective_video, resize, resize_bounding_box, resize_image_pil, resize_image_tensor, resize_mask, + resize_video, resized_crop, resized_crop_bounding_box, resized_crop_image_pil, resized_crop_image_tensor, resized_crop_mask, + resized_crop_video, rotate, rotate_bounding_box, rotate_image_pil, rotate_image_tensor, rotate_mask, + rotate_video, ten_crop, ten_crop_image_pil, ten_crop_image_tensor, @@ -113,9 +135,18 @@ from ._geometry import ( vertical_flip_image_pil, vertical_flip_image_tensor, vertical_flip_mask, + vertical_flip_video, vflip, ) -from ._misc import gaussian_blur, gaussian_blur_image_pil, gaussian_blur_image_tensor, normalize, normalize_image_tensor +from ._misc import ( + gaussian_blur, + gaussian_blur_image_pil, + gaussian_blur_image_tensor, + gaussian_blur_video, + normalize, + normalize_image_tensor, + normalize_video, +) from ._type_conversion import ( convert_image_dtype, decode_image_with_pil, diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index fb48c3588..976feb99e 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -17,19 +17,25 @@ def erase_image_pil( return to_pil_image(output, mode=image.mode) +def erase_video( + video: torch.Tensor, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False +) -> torch.Tensor: + return erase_image_tensor(video, i=i, j=j, h=h, w=w, v=v, inplace=inplace) + + def erase( - inpt: features.ImageTypeJIT, + inpt: features.ImageOrVideoTypeJIT, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False, -) -> features.ImageTypeJIT: +) -> features.ImageOrVideoTypeJIT: if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) - if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output) + if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): + output = type(inpt).new_like(inpt, output) # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return erase_image_pil(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index f375cb048..d11dd3c3b 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -2,10 +2,16 @@ import torch from torchvision.prototype import features from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT +from ._meta import get_dimensions_image_tensor + adjust_brightness_image_tensor = _FT.adjust_brightness adjust_brightness_image_pil = _FP.adjust_brightness 
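# --- Editor's note: an illustrative sketch, not part of the patch. The *_video kernels
# added below are thin wrappers over the corresponding *_image_tensor kernels: those
# kernels only look at the trailing (C, H, W) dimensions, so a (..., T, C, H, W) video
# tensor passes through them with the frame dimension treated as just another batch
# dimension and every frame processed the same way. The helper below is a hypothetical
# usage sketch of that behaviour and is not referenced anywhere in the patch.


def _sketch_brighten_clip(clip: torch.Tensor, brightness_factor: float = 1.5) -> torch.Tensor:
    # clip: a (T, C, H, W) uint8 or float video tensor; output keeps the same shape and dtype.
    return adjust_brightness_image_tensor(clip, brightness_factor=brightness_factor)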
+def adjust_brightness_video(video: torch.Tensor, brightness_factor: float) -> torch.Tensor: + return adjust_brightness_image_tensor(video, brightness_factor=brightness_factor) + + def adjust_brightness(inpt: features.InputTypeJIT, brightness_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_brightness_image_tensor(inpt, brightness_factor=brightness_factor) @@ -19,6 +25,10 @@ adjust_saturation_image_tensor = _FT.adjust_saturation adjust_saturation_image_pil = _FP.adjust_saturation +def adjust_saturation_video(video: torch.Tensor, saturation_factor: float) -> torch.Tensor: + return adjust_saturation_image_tensor(video, saturation_factor=saturation_factor) + + def adjust_saturation(inpt: features.InputTypeJIT, saturation_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_saturation_image_tensor(inpt, saturation_factor=saturation_factor) @@ -32,6 +42,10 @@ adjust_contrast_image_tensor = _FT.adjust_contrast adjust_contrast_image_pil = _FP.adjust_contrast +def adjust_contrast_video(video: torch.Tensor, contrast_factor: float) -> torch.Tensor: + return adjust_contrast_image_tensor(video, contrast_factor=contrast_factor) + + def adjust_contrast(inpt: features.InputTypeJIT, contrast_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_contrast_image_tensor(inpt, contrast_factor=contrast_factor) @@ -41,10 +55,40 @@ def adjust_contrast(inpt: features.InputTypeJIT, contrast_factor: float) -> feat return adjust_contrast_image_pil(inpt, contrast_factor=contrast_factor) -adjust_sharpness_image_tensor = _FT.adjust_sharpness +def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float) -> torch.Tensor: + num_channels, height, width = get_dimensions_image_tensor(image) + if num_channels not in (1, 3): + raise TypeError(f"Input image tensor can have 1 or 3 channels, but found {num_channels}") + + if sharpness_factor < 0: + raise ValueError(f"sharpness_factor ({sharpness_factor}) is not non-negative.") + + if image.numel() == 0 or height <= 2 or width <= 2: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view(-1, num_channels, height, width) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT._blend(image, _FT._blurred_degenerate_image(image), sharpness_factor) + + if needs_unsquash: + output = output.view(shape) + + return output + + adjust_sharpness_image_pil = _FP.adjust_sharpness +def adjust_sharpness_video(video: torch.Tensor, sharpness_factor: float) -> torch.Tensor: + return adjust_sharpness_image_tensor(video, sharpness_factor=sharpness_factor) + + def adjust_sharpness(inpt: features.InputTypeJIT, sharpness_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_sharpness_image_tensor(inpt, sharpness_factor=sharpness_factor) @@ -58,6 +102,10 @@ adjust_hue_image_tensor = _FT.adjust_hue adjust_hue_image_pil = _FP.adjust_hue +def adjust_hue_video(video: torch.Tensor, hue_factor: float) -> torch.Tensor: + return adjust_hue_image_tensor(video, hue_factor=hue_factor) + + def adjust_hue(inpt: features.InputTypeJIT, hue_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and 
(torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_hue_image_tensor(inpt, hue_factor=hue_factor) @@ -71,6 +119,10 @@ adjust_gamma_image_tensor = _FT.adjust_gamma adjust_gamma_image_pil = _FP.adjust_gamma +def adjust_gamma_video(video: torch.Tensor, gamma: float, gain: float = 1) -> torch.Tensor: + return adjust_gamma_image_tensor(video, gamma=gamma, gain=gain) + + def adjust_gamma(inpt: features.InputTypeJIT, gamma: float, gain: float = 1) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_gamma_image_tensor(inpt, gamma=gamma, gain=gain) @@ -84,6 +136,10 @@ posterize_image_tensor = _FT.posterize posterize_image_pil = _FP.posterize +def posterize_video(video: torch.Tensor, bits: int) -> torch.Tensor: + return posterize_image_tensor(video, bits=bits) + + def posterize(inpt: features.InputTypeJIT, bits: int) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return posterize_image_tensor(inpt, bits=bits) @@ -97,6 +153,10 @@ solarize_image_tensor = _FT.solarize solarize_image_pil = _FP.solarize +def solarize_video(video: torch.Tensor, threshold: float) -> torch.Tensor: + return solarize_image_tensor(video, threshold=threshold) + + def solarize(inpt: features.InputTypeJIT, threshold: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return solarize_image_tensor(inpt, threshold=threshold) @@ -110,6 +170,10 @@ autocontrast_image_tensor = _FT.autocontrast autocontrast_image_pil = _FP.autocontrast +def autocontrast_video(video: torch.Tensor) -> torch.Tensor: + return autocontrast_image_tensor(video) + + def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return autocontrast_image_tensor(inpt) @@ -119,10 +183,35 @@ def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: return autocontrast_image_pil(inpt) -equalize_image_tensor = _FT.equalize +def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: + if image.dtype != torch.uint8: + raise TypeError(f"Only torch.uint8 image tensors are supported, but found {image.dtype}") + + num_channels, height, width = get_dimensions_image_tensor(image) + if num_channels not in (1, 3): + raise TypeError(f"Input image tensor can have 1 or 3 channels, but found {num_channels}") + + if image.numel() == 0: + return image + elif image.ndim == 2: + return _FT._scale_channel(image) + else: + return torch.stack( + [ + # TODO: when merging transforms v1 and v2, we can inline this function call + _FT._equalize_single_image(single_image) + for single_image in image.view(-1, num_channels, height, width) + ] + ).view(image.shape) + + equalize_image_pil = _FP.equalize +def equalize_video(video: torch.Tensor) -> torch.Tensor: + return equalize_image_tensor(video) + + def equalize(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return equalize_image_tensor(inpt) @@ -136,6 +225,10 @@ invert_image_tensor = _FT.invert invert_image_pil = _FP.invert +def invert_video(video: torch.Tensor) -> torch.Tensor: + return invert_image_tensor(video) + + def invert(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if 
isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return invert_image_tensor(inpt) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 7a291967b..f205b5aea 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -47,6 +47,10 @@ def horizontal_flip_bounding_box( ).view(shape) +def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor: + return horizontal_flip_image_tensor(video) + + def horizontal_flip(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return horizontal_flip_image_tensor(inpt) @@ -80,6 +84,10 @@ def vertical_flip_bounding_box( ).view(shape) +def vertical_flip_video(video: torch.Tensor) -> torch.Tensor: + return vertical_flip_image_tensor(video) + + def vertical_flip(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return vertical_flip_image_tensor(inpt) @@ -185,6 +193,16 @@ def resize_bounding_box( ) +def resize_video( + video: torch.Tensor, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + max_size: Optional[int] = None, + antialias: bool = False, +) -> torch.Tensor: + return resize_image_tensor(video, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias) + + def resize( inpt: features.InputTypeJIT, size: List[int], @@ -441,6 +459,28 @@ def affine_mask( return output +def affine_video( + video: torch.Tensor, + angle: Union[int, float], + translate: List[float], + scale: float, + shear: List[float], + interpolation: InterpolationMode = InterpolationMode.NEAREST, + fill: features.FillTypeJIT = None, + center: Optional[List[float]] = None, +) -> torch.Tensor: + return affine_image_tensor( + video, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + interpolation=interpolation, + fill=fill, + center=center, + ) + + def _convert_fill_arg(fill: features.FillType) -> features.FillTypeJIT: # Fill = 0 is not equivalent to None, https://github.com/pytorch/vision/issues/6517 # So, we can't reassign fill to 0 @@ -614,6 +654,17 @@ def rotate_mask( return output +def rotate_video( + video: torch.Tensor, + angle: float, + interpolation: InterpolationMode = InterpolationMode.NEAREST, + expand: bool = False, + fill: features.FillTypeJIT = None, + center: Optional[List[float]] = None, +) -> torch.Tensor: + return rotate_image_tensor(video, angle, interpolation=interpolation, expand=expand, fill=fill, center=center) + + def rotate( inpt: features.InputTypeJIT, angle: float, @@ -751,6 +802,15 @@ def pad_bounding_box( return bounding_box, (height, width) +def pad_video( + video: torch.Tensor, + padding: Union[int, List[int]], + fill: features.FillTypeJIT = None, + padding_mode: str = "constant", +) -> torch.Tensor: + return pad_image_tensor(video, padding, fill=fill, padding_mode=padding_mode) + + def pad( inpt: features.InputTypeJIT, padding: Union[int, List[int]], @@ -798,6 +858,10 @@ def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) return crop_image_tensor(mask, top, left, height, width) +def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: + return crop_image_tensor(video, top, left, height, 
width) + + def crop(inpt: features.InputTypeJIT, top: int, left: int, height: int, width: int) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return crop_image_tensor(inpt, top, left, height, width) @@ -932,6 +996,33 @@ def perspective_mask( return output +def perspective_video( + video: torch.Tensor, + perspective_coeffs: List[float], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: features.FillTypeJIT = None, +) -> torch.Tensor: + # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when + # https://github.com/pytorch/vision/issues/6670 is resolved. + if video.numel() == 0: + return video + + shape = video.shape + + if video.ndim > 4: + video = video.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = perspective_image_tensor(video, perspective_coeffs, interpolation=interpolation, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output + + def perspective( inpt: features.InputTypeJIT, perspective_coeffs: List[float], @@ -1026,6 +1117,33 @@ def elastic_mask( return output +def elastic_video( + video: torch.Tensor, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: features.FillTypeJIT = None, +) -> torch.Tensor: + # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when + # https://github.com/pytorch/vision/issues/6670 is resolved. + if video.numel() == 0: + return video + + shape = video.shape + + if video.ndim > 4: + video = video.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = elastic_image_tensor(video, displacement, interpolation=interpolation, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output + + def elastic( inpt: features.InputTypeJIT, displacement: torch.Tensor, @@ -1128,6 +1246,10 @@ def center_crop_mask(mask: torch.Tensor, output_size: List[int]) -> torch.Tensor return output +def center_crop_video(video: torch.Tensor, output_size: List[int]) -> torch.Tensor: + return center_crop_image_tensor(video, output_size) + + def center_crop(inpt: features.InputTypeJIT, output_size: List[int]) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return center_crop_image_tensor(inpt, output_size) @@ -1190,6 +1312,21 @@ def resized_crop_mask( return resize_mask(mask, size) +def resized_crop_video( + video: torch.Tensor, + top: int, + left: int, + height: int, + width: int, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias: bool = False, +) -> torch.Tensor: + return resized_crop_image_tensor( + video, top, left, height, width, antialias=antialias, size=size, interpolation=interpolation + ) + + def resized_crop( inpt: features.InputTypeJIT, top: int, diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index 90cfffcf2..1e53edf39 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -11,10 +11,12 @@ get_dimensions_image_pil = _FP.get_dimensions # TODO: Should this be prefixed with `_` similar to other methods that don't get exposed by init? 
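# The hunk below widens this helper from images to images-or-videos. The
# dispatch pattern is: plain tensors (or any tensor while scripting) go through
# the tensor kernel, feature subclasses expose their own metadata, and anything
# else is assumed to be a PIL image. A toy sketch of that dispatch, with
# FakeImage standing in for features.Image / features.Video (illustrative
# names only, not the torchvision API):
import torch


class FakeImage(torch.Tensor):
    """Stand-in for a feature subclass that knows its own spatial size."""


def get_spatial_size_sketch(inpt):
    if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, FakeImage)):
        return list(inpt.shape[-2:])  # simple tensor: read the trailing dims
    elif isinstance(inpt, FakeImage):
        return list(inpt.shape[-2:])  # subclass: could read stored metadata instead
    else:  # PIL.Image.Image
        return [inpt.height, inpt.width]


print(get_spatial_size_sketch(torch.rand(3, 4, 5)))                             # [4, 5]
print(get_spatial_size_sketch(torch.rand(2, 3, 4, 5).as_subclass(FakeImage)))   # [4, 5]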
-def get_chw(image: features.ImageTypeJIT) -> Tuple[int, int, int]: - if isinstance(image, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(image, features.Image)): +def get_chw(image: features.ImageOrVideoTypeJIT) -> Tuple[int, int, int]: + if isinstance(image, torch.Tensor) and ( + torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) + ): channels, height, width = get_dimensions_image_tensor(image) - elif isinstance(image, features.Image): + elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels height, width = image.image_size else: # isinstance(image, PIL.Image.Image) @@ -29,11 +31,11 @@ def get_chw(image: features.ImageTypeJIT) -> Tuple[int, int, int]: # detailed above. -def get_dimensions(image: features.ImageTypeJIT) -> List[int]: +def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: return list(get_chw(image)) -def get_num_channels(image: features.ImageTypeJIT) -> int: +def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: num_channels, *_ = get_chw(image) return num_channels @@ -43,7 +45,7 @@ def get_num_channels(image: features.ImageTypeJIT) -> int: get_image_num_channels = get_num_channels -def get_spatial_size(image: features.ImageTypeJIT) -> List[int]: +def get_spatial_size(image: features.ImageOrVideoTypeJIT) -> List[int]: _, *size = get_chw(image) return size @@ -207,13 +209,23 @@ def convert_color_space_image_pil( return image.convert(new_mode) +def convert_color_space_video( + video: torch.Tensor, old_color_space: ColorSpace, new_color_space: ColorSpace, copy: bool = True +) -> torch.Tensor: + return convert_color_space_image_tensor( + video, old_color_space=old_color_space, new_color_space=new_color_space, copy=copy + ) + + def convert_color_space( - inpt: features.ImageTypeJIT, + inpt: features.ImageOrVideoTypeJIT, color_space: ColorSpace, old_color_space: Optional[ColorSpace] = None, copy: bool = True, -) -> features.ImageTypeJIT: - if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Image)): +) -> features.ImageOrVideoTypeJIT: + if isinstance(inpt, torch.Tensor) and ( + torch.jit.is_scripting() or not isinstance(inpt, (features.Image, features.Video)) + ): if old_color_space is None: raise RuntimeError( "In order to convert the color space of simple tensor images, " @@ -222,7 +234,7 @@ def convert_color_space( return convert_color_space_image_tensor( inpt, old_color_space=old_color_space, new_color_space=color_space, copy=copy ) - elif isinstance(inpt, features.Image): + elif isinstance(inpt, (features.Image, features.Video)): return inpt.to_color_space(color_space, copy=copy) else: - return cast(features.ImageTypeJIT, convert_color_space_image_pil(inpt, color_space, copy=copy)) + return cast(features.ImageOrVideoTypeJIT, convert_color_space_image_pil(inpt, color_space, copy=copy)) diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 6f35781d4..7b3773e63 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -9,18 +9,22 @@ from torchvision.transforms.functional import pil_to_tensor, to_pil_image normalize_image_tensor = _FT.normalize +def normalize_video(video: torch.Tensor, mean: List[float], std: List[float], inplace: bool = False) -> torch.Tensor: + return normalize_image_tensor(video, mean, std, inplace=inplace) + + def normalize( - inpt: features.TensorImageTypeJIT, mean: 
List[float], std: List[float], inplace: bool = False + inpt: features.TensorImageOrVideoTypeJIT, mean: List[float], std: List[float], inplace: bool = False ) -> torch.Tensor: if torch.jit.is_scripting(): correct_type = isinstance(inpt, torch.Tensor) else: - correct_type = features.is_simple_tensor(inpt) or isinstance(inpt, features.Image) + correct_type = features.is_simple_tensor(inpt) or isinstance(inpt, (features.Image, features.Video)) inpt = inpt.as_subclass(torch.Tensor) if not correct_type: raise TypeError(f"img should be Tensor Image. Got {type(inpt)}") - # Image instance after normalization is not Image anymore due to unknown data range + # Image or Video type should not be retained after normalization due to unknown data range # Thus we return Tensor for input Image return normalize_image_tensor(inpt, mean=mean, std=std, inplace=inplace) @@ -64,6 +68,30 @@ def gaussian_blur_image_pil( return to_pil_image(output, mode=image.mode) +def gaussian_blur_video( + video: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None +) -> torch.Tensor: + # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when + # https://github.com/pytorch/vision/issues/6670 is resolved. + if video.numel() == 0: + return video + + shape = video.shape + + if video.ndim > 4: + video = video.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = gaussian_blur_image_tensor(video, kernel_size, sigma) + + if needs_unsquash: + output = output.view(shape) + + return output + + def gaussian_blur( inpt: features.InputTypeJIT, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> features.InputTypeJIT: -- GitLab From 4c049ca3b74c2f93bb2acd952548626aada08fe0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 7 Oct 2022 17:35:23 +0200 Subject: [PATCH 021/624] replace new_like with wrap_like (#6718) * replace new_like with wrap_like * fix videos * revert casting in favor of ignoring mypy --- test/test_prototype_features.py | 4 +- test/test_prototype_transforms.py | 17 ++-- .../prototype/features/_bounding_box.py | 57 +++++++------ torchvision/prototype/features/_encoded.py | 11 ++- torchvision/prototype/features/_feature.py | 55 +++++------- torchvision/prototype/features/_image.py | 85 ++++++++++--------- torchvision/prototype/features/_label.py | 20 +++-- torchvision/prototype/features/_mask.py | 47 +++++++--- torchvision/prototype/features/_video.py | 74 ++++++++-------- torchvision/prototype/transforms/_augment.py | 22 ++--- .../prototype/transforms/_auto_augment.py | 2 +- torchvision/prototype/transforms/_color.py | 3 +- .../prototype/transforms/_deprecated.py | 4 +- torchvision/prototype/transforms/_geometry.py | 16 ++-- torchvision/prototype/transforms/_meta.py | 10 ++- torchvision/prototype/transforms/_misc.py | 2 +- .../transforms/functional/_augment.py | 2 +- .../transforms/functional/_geometry.py | 4 +- 18 files changed, 239 insertions(+), 196 deletions(-) diff --git a/test/test_prototype_features.py b/test/test_prototype_features.py index 2701dd66b..d2b0d2e63 100644 --- a/test/test_prototype_features.py +++ b/test/test_prototype_features.py @@ -99,14 +99,14 @@ def test_inplace_op_no_wrapping(): assert type(label) is features.Label -def test_new_like(): +def test_wrap_like(): tensor = torch.tensor([0, 1, 0], dtype=torch.int64) label = features.Label(tensor, categories=["foo", "bar"]) # any operation besides .to() and .clone() will do here output = label * 2 - label_new = 
features.Label.new_like(label, output) + label_new = features.Label.wrap_like(label, output) assert type(label_new) is features.Label assert label_new.data_ptr() == output.data_ptr() diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 916861f4e..4037a7467 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -8,6 +8,7 @@ import pytest import torch from common_utils import assert_equal, cpu_and_gpu from prototype_common_utils import ( + DEFAULT_EXTRA_DIMS, make_bounding_box, make_bounding_boxes, make_detection_mask, @@ -23,6 +24,8 @@ from torchvision.ops.boxes import box_iou from torchvision.prototype import features, transforms from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image +BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] + def make_vanilla_tensor_images(*args, **kwargs): for image in make_images(*args, **kwargs): @@ -109,13 +112,11 @@ class TestSmoke: ( transform, [ - dict( - image=features.Image.new_like(image, image.unsqueeze(0), dtype=torch.float), - one_hot_label=features.OneHotLabel.new_like( - one_hot_label, one_hot_label.unsqueeze(0), dtype=torch.float - ), + dict(image=image, one_hot_label=one_hot_label) + for image, one_hot_label in itertools.product( + make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + make_one_hot_labels(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), ) - for image, one_hot_label in itertools.product(make_images(), make_one_hot_labels()) ], ) for transform in [ @@ -300,7 +301,7 @@ class TestRandomHorizontalFlip: actual = transform(input) expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) + expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format assert actual.image_size == expected.image_size @@ -353,7 +354,7 @@ class TestRandomVerticalFlip: actual = transform(input) expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) + expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format assert actual.image_size == expected.image_size diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py index 9ccd4fa62..7b69af5f9 100644 --- a/torchvision/prototype/features/_bounding_box.py +++ b/torchvision/prototype/features/_bounding_box.py @@ -19,6 +19,13 @@ class BoundingBox(_Feature): format: BoundingBoxFormat image_size: Tuple[int, int] + @classmethod + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, image_size: Tuple[int, int]) -> BoundingBox: + bounding_box = tensor.as_subclass(cls) + bounding_box.format = format + bounding_box.image_size = image_size + return bounding_box + def __new__( cls, data: Any, @@ -29,52 +36,46 @@ class BoundingBox(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> BoundingBox: - bounding_box = super().__new__(cls, data, dtype=dtype, device=device, requires_grad=requires_grad) + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - bounding_box.format = format - - 
bounding_box.image_size = image_size - return bounding_box - - def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, image_size=self.image_size) + return cls._wrap(tensor, format=format, image_size=image_size) @classmethod - def new_like( + def wrap_like( cls, other: BoundingBox, - data: Any, + tensor: torch.Tensor, *, - format: Optional[Union[BoundingBoxFormat, str]] = None, + format: Optional[BoundingBoxFormat] = None, image_size: Optional[Tuple[int, int]] = None, - **kwargs: Any, ) -> BoundingBox: - return super().new_like( - other, - data, + return cls._wrap( + tensor, format=format if format is not None else other.format, image_size=image_size if image_size is not None else other.image_size, - **kwargs, ) + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(format=self.format, image_size=self.image_size) + def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - return BoundingBox.new_like( + return BoundingBox.wrap_like( self, self._F.convert_format_bounding_box(self, old_format=self.format, new_format=format), format=format ) def horizontal_flip(self) -> BoundingBox: output = self._F.horizontal_flip_bounding_box(self, format=self.format, image_size=self.image_size) - return BoundingBox.new_like(self, output) + return BoundingBox.wrap_like(self, output) def vertical_flip(self) -> BoundingBox: output = self._F.vertical_flip_bounding_box(self, format=self.format, image_size=self.image_size) - return BoundingBox.new_like(self, output) + return BoundingBox.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -84,19 +85,19 @@ class BoundingBox(_Feature): antialias: bool = False, ) -> BoundingBox: output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: output, image_size = self._F.crop_bounding_box( self, self.format, top=top, left=left, height=height, width=width ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def center_crop(self, output_size: List[int]) -> BoundingBox: output, image_size = self._F.center_crop_bounding_box( self, format=self.format, image_size=self.image_size, output_size=output_size ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def resized_crop( self, @@ -109,7 +110,7 @@ class BoundingBox(_Feature): antialias: bool = False, ) -> BoundingBox: output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def pad( self, @@ -120,7 +121,7 @@ class BoundingBox(_Feature): output, image_size = self._F.pad_bounding_box( self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def rotate( self, @@ -133,7 +134,7 @@ class 
BoundingBox(_Feature): output, image_size = self._F.rotate_bounding_box( self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def affine( self, @@ -155,7 +156,7 @@ class BoundingBox(_Feature): shear=shear, center=center, ) - return BoundingBox.new_like(self, output, dtype=output.dtype) + return BoundingBox.wrap_like(self, output) def perspective( self, @@ -164,7 +165,7 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, ) -> BoundingBox: output = self._F.perspective_bounding_box(self, self.format, perspective_coeffs) - return BoundingBox.new_like(self, output, dtype=output.dtype) + return BoundingBox.wrap_like(self, output) def elastic( self, @@ -173,4 +174,4 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, ) -> BoundingBox: output = self._F.elastic_bounding_box(self, self.format, displacement) - return BoundingBox.new_like(self, output, dtype=output.dtype) + return BoundingBox.wrap_like(self, output) diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 0ec14ab20..4b963986b 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -14,6 +14,10 @@ D = TypeVar("D", bound="EncodedData") class EncodedData(_Feature): + @classmethod + def _wrap(cls: Type[D], tensor: torch.Tensor) -> D: + return tensor.as_subclass(cls) + def __new__( cls, data: Any, @@ -22,8 +26,13 @@ class EncodedData(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> EncodedData: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) # TODO: warn / bail out if we encounter a tensor with shape other than (N,) or with dtype other than uint8? - return super().__new__(cls, data, dtype=dtype, device=device, requires_grad=requires_grad) + return cls._wrap(tensor) + + @classmethod + def wrap_like(cls: Type[D], other: D, tensor: torch.Tensor) -> D: + return cls._wrap(tensor) @classmethod def from_file(cls: Type[D], file: BinaryIO, **kwargs: Any) -> D: diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index 2da10be90..a56441f29 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -21,48 +21,39 @@ def is_simple_tensor(inpt: Any) -> bool: class _Feature(torch.Tensor): __F: Optional[ModuleType] = None - def __new__( - cls: Type[F], + @staticmethod + def _to_tensor( data: Any, - *, dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, - ) -> F: - return ( - torch.as_tensor( # type: ignore[return-value] - data, - dtype=dtype, - device=device, - ) - .as_subclass(cls) - .requires_grad_(requires_grad) - ) + ) -> torch.Tensor: + return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad) - @classmethod - def new_like( - cls: Type[F], - other: F, + # FIXME: this is just here for BC with the prototype datasets. Some datasets use the _Feature directly to have a + # a no-op input for the prototype transforms. For this use case, we can't use plain tensors, since they will be + # interpreted as images. We should decide if we want a public no-op feature like `GenericFeature` or make this one + # public again. 
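# The surrounding commit replaces `new_like` with `wrap_like`: instead of
# re-running the validating constructor, the raw output tensor of an op is
# wrapped back into the input's feature type and the metadata is copied from
# the reference instance. A hypothetical toy version of that pattern
# (illustrative only, not the torchvision classes):
import torch


class ToyLabel(torch.Tensor):
    categories = None

    @classmethod
    def _wrap(cls, tensor: torch.Tensor, *, categories):
        wrapped = tensor.as_subclass(cls)
        wrapped.categories = categories
        return wrapped

    @classmethod
    def wrap_like(cls, other: "ToyLabel", tensor: torch.Tensor) -> "ToyLabel":
        return cls._wrap(tensor, categories=other.categories)


label = ToyLabel._wrap(torch.tensor([0, 1, 0]), categories=["foo", "bar"])
output = torch.mul(label, 2)                   # plain op output; metadata is not carried over
relabeled = ToyLabel.wrap_like(label, output)  # re-attach type and metadata cheaply
assert isinstance(relabeled, ToyLabel) and relabeled.categories == ["foo", "bar"]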
+ def __new__( + cls, data: Any, - *, dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, - requires_grad: Optional[bool] = None, - **kwargs: Any, - ) -> F: - return cls( - data, - dtype=dtype if dtype is not None else other.dtype, - device=device if device is not None else other.device, - requires_grad=requires_grad if requires_grad is not None else other.requires_grad, - **kwargs, - ) + requires_grad: bool = False, + ) -> _Feature: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + return tensor.as_subclass(_Feature) + + @classmethod + def wrap_like(cls: Type[F], other: F, tensor: torch.Tensor) -> F: + # FIXME: this is just here for BC with the prototype datasets. See __new__ for details. If that is resolved, + # this method should be made abstract + # raise NotImplementedError + return tensor.as_subclass(cls) _NO_WRAPPING_EXCEPTIONS = { - torch.Tensor.clone: lambda cls, input, output: cls.new_like(input, output), - torch.Tensor.to: lambda cls, input, output: cls.new_like( - input, output, dtype=output.dtype, device=output.device - ), + torch.Tensor.clone: lambda cls, input, output: cls.wrap_like(input, output), + torch.Tensor.to: lambda cls, input, output: cls.wrap_like(input, output), # We don't need to wrap the output of `Tensor.requires_grad_`, since it is an inplace operation and thus # retains the type automatically torch.Tensor.requires_grad_: lambda cls, input, output: output, diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index c953ae78c..23f81678d 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -62,6 +62,12 @@ def _from_tensor_shape(shape: List[int]) -> ColorSpace: class Image(_Feature): color_space: ColorSpace + @classmethod + def _wrap(cls, tensor: torch.Tensor, *, color_space: ColorSpace) -> Image: + image = tensor.as_subclass(cls) + image.color_space = color_space + return image + def __new__( cls, data: Any, @@ -71,36 +77,33 @@ class Image(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> Image: - data = torch.as_tensor(data, dtype=dtype, device=device) - if data.ndim < 2: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + if tensor.ndim < 2: raise ValueError - elif data.ndim == 2: - data = data.unsqueeze(0) - image = super().__new__(cls, data, requires_grad=requires_grad) + elif tensor.ndim == 2: + tensor = tensor.unsqueeze(0) if color_space is None: - color_space = ColorSpace.from_tensor_shape(image.shape) # type: ignore[arg-type] + color_space = ColorSpace.from_tensor_shape(tensor.shape) # type: ignore[arg-type] if color_space == ColorSpace.OTHER: warnings.warn("Unable to guess a specific color space. 
Consider passing it explicitly.") elif isinstance(color_space, str): color_space = ColorSpace.from_str(color_space.upper()) elif not isinstance(color_space, ColorSpace): raise ValueError - image.color_space = color_space - return image - - def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(color_space=self.color_space) + return cls._wrap(tensor, color_space=color_space) @classmethod - def new_like( - cls, other: Image, data: Any, *, color_space: Optional[Union[ColorSpace, str]] = None, **kwargs: Any - ) -> Image: - return super().new_like( - other, data, color_space=color_space if color_space is not None else other.color_space, **kwargs + def wrap_like(cls, other: Image, tensor: torch.Tensor, *, color_space: Optional[ColorSpace] = None) -> Image: + return cls._wrap( + tensor, + color_space=color_space if color_space is not None else other.color_space, ) + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(color_space=self.color_space) + @property def image_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @@ -113,7 +116,7 @@ class Image(_Feature): if isinstance(color_space, str): color_space = ColorSpace.from_str(color_space.upper()) - return Image.new_like( + return Image.wrap_like( self, self._F.convert_color_space_image_tensor( self, old_color_space=self.color_space, new_color_space=color_space, copy=copy @@ -129,15 +132,15 @@ class Image(_Feature): def draw_bounding_box(self, bounding_box: BoundingBox, **kwargs: Any) -> Image: # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we # promote this out of the prototype state - return Image.new_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs)) + return Image.wrap_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs)) def horizontal_flip(self) -> Image: output = self._F.horizontal_flip_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def vertical_flip(self) -> Image: output = self._F.vertical_flip_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -149,15 +152,15 @@ class Image(_Feature): output = self._F.resize_image_tensor( self, size, interpolation=interpolation, max_size=max_size, antialias=antialias ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def crop(self, top: int, left: int, height: int, width: int) -> Image: output = self._F.crop_image_tensor(self, top, left, height, width) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def center_crop(self, output_size: List[int]) -> Image: output = self._F.center_crop_image_tensor(self, output_size=output_size) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def resized_crop( self, @@ -172,7 +175,7 @@ class Image(_Feature): output = self._F.resized_crop_image_tensor( self, top, left, height, width, size=list(size), interpolation=interpolation, antialias=antialias ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def pad( self, @@ -181,7 +184,7 @@ class Image(_Feature): padding_mode: str = "constant", ) -> Image: output = self._F.pad_image_tensor(self, padding, fill=fill, padding_mode=padding_mode) - return Image.new_like(self, output) + return 
Image.wrap_like(self, output) def rotate( self, @@ -194,7 +197,7 @@ class Image(_Feature): output = self._F._geometry.rotate_image_tensor( self, angle, interpolation=interpolation, expand=expand, fill=fill, center=center ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def affine( self, @@ -216,7 +219,7 @@ class Image(_Feature): fill=fill, center=center, ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def perspective( self, @@ -227,7 +230,7 @@ class Image(_Feature): output = self._F._geometry.perspective_image_tensor( self, perspective_coeffs, interpolation=interpolation, fill=fill ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def elastic( self, @@ -236,55 +239,55 @@ class Image(_Feature): fill: FillTypeJIT = None, ) -> Image: output = self._F._geometry.elastic_image_tensor(self, displacement, interpolation=interpolation, fill=fill) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_brightness(self, brightness_factor: float) -> Image: output = self._F.adjust_brightness_image_tensor(self, brightness_factor=brightness_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_saturation(self, saturation_factor: float) -> Image: output = self._F.adjust_saturation_image_tensor(self, saturation_factor=saturation_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_contrast(self, contrast_factor: float) -> Image: output = self._F.adjust_contrast_image_tensor(self, contrast_factor=contrast_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_sharpness(self, sharpness_factor: float) -> Image: output = self._F.adjust_sharpness_image_tensor(self, sharpness_factor=sharpness_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_hue(self, hue_factor: float) -> Image: output = self._F.adjust_hue_image_tensor(self, hue_factor=hue_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_gamma(self, gamma: float, gain: float = 1) -> Image: output = self._F.adjust_gamma_image_tensor(self, gamma=gamma, gain=gain) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def posterize(self, bits: int) -> Image: output = self._F.posterize_image_tensor(self, bits=bits) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def solarize(self, threshold: float) -> Image: output = self._F.solarize_image_tensor(self, threshold=threshold) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def autocontrast(self) -> Image: output = self._F.autocontrast_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def equalize(self) -> Image: output = self._F.equalize_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def invert(self) -> Image: output = self._F.invert_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Image: output = self._F.gaussian_blur_image_tensor(self, kernel_size=kernel_size, sigma=sigma) - return Image.new_like(self, output) + return Image.wrap_like(self, output) ImageType = Union[torch.Tensor, PIL.Image.Image, Image] diff --git a/torchvision/prototype/features/_label.py 
b/torchvision/prototype/features/_label.py index ebaa84d66..9c2bcfc0f 100644 --- a/torchvision/prototype/features/_label.py +++ b/torchvision/prototype/features/_label.py @@ -14,6 +14,12 @@ L = TypeVar("L", bound="_LabelBase") class _LabelBase(_Feature): categories: Optional[Sequence[str]] + @classmethod + def _wrap(cls: Type[L], tensor: torch.Tensor, *, categories: Optional[Sequence[str]]) -> L: + label_base = tensor.as_subclass(cls) + label_base.categories = categories + return label_base + def __new__( cls: Type[L], data: Any, @@ -23,16 +29,14 @@ class _LabelBase(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> L: - label_base = super().__new__(cls, data, dtype=dtype, device=device, requires_grad=requires_grad) - - label_base.categories = categories - - return label_base + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + return cls._wrap(tensor, categories=categories) @classmethod - def new_like(cls: Type[L], other: L, data: Any, *, categories: Optional[Sequence[str]] = None, **kwargs: Any) -> L: - return super().new_like( - other, data, categories=categories if categories is not None else other.categories, **kwargs + def wrap_like(cls: Type[L], other: L, tensor: torch.Tensor, *, categories: Optional[Sequence[str]] = None) -> L: + return cls._wrap( + tensor, + categories=categories if categories is not None else other.categories, ) @classmethod diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 9dd614752..65793dc45 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional, Union +from typing import Any, List, Optional, Union import torch from torchvision.transforms import InterpolationMode @@ -9,13 +9,36 @@ from ._feature import _Feature, FillTypeJIT class Mask(_Feature): + @classmethod + def _wrap(cls, tensor: torch.Tensor) -> Mask: + return tensor.as_subclass(cls) + + def __new__( + cls, + data: Any, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str, int]] = None, + requires_grad: bool = False, + ) -> Mask: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + return cls._wrap(tensor) + + @classmethod + def wrap_like( + cls, + other: Mask, + tensor: torch.Tensor, + ) -> Mask: + return cls._wrap(tensor) + def horizontal_flip(self) -> Mask: output = self._F.horizontal_flip_mask(self) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def vertical_flip(self) -> Mask: output = self._F.vertical_flip_mask(self) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -25,15 +48,15 @@ class Mask(_Feature): antialias: bool = False, ) -> Mask: output = self._F.resize_mask(self, size, max_size=max_size) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def crop(self, top: int, left: int, height: int, width: int) -> Mask: output = self._F.crop_mask(self, top, left, height, width) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def center_crop(self, output_size: List[int]) -> Mask: output = self._F.center_crop_mask(self, output_size=output_size) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def resized_crop( self, @@ -46,7 +69,7 @@ class Mask(_Feature): antialias: bool = False, ) -> Mask: 
output = self._F.resized_crop_mask(self, top, left, height, width, size=size) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def pad( self, @@ -55,7 +78,7 @@ class Mask(_Feature): padding_mode: str = "constant", ) -> Mask: output = self._F.pad_mask(self, padding, padding_mode=padding_mode, fill=fill) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def rotate( self, @@ -66,7 +89,7 @@ class Mask(_Feature): center: Optional[List[float]] = None, ) -> Mask: output = self._F.rotate_mask(self, angle, expand=expand, center=center, fill=fill) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def affine( self, @@ -87,7 +110,7 @@ class Mask(_Feature): fill=fill, center=center, ) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def perspective( self, @@ -96,7 +119,7 @@ class Mask(_Feature): fill: FillTypeJIT = None, ) -> Mask: output = self._F.perspective_mask(self, perspective_coeffs, fill=fill) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def elastic( self, @@ -105,4 +128,4 @@ class Mask(_Feature): fill: FillTypeJIT = None, ) -> Mask: output = self._F.elastic_mask(self, displacement, fill=fill) - return Mask.new_like(self, output, dtype=output.dtype) + return Mask.wrap_like(self, output) diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index e19b6f7ed..a58027243 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -13,6 +13,12 @@ from ._image import ColorSpace, ImageType, ImageTypeJIT, TensorImageType, Tensor class Video(_Feature): color_space: ColorSpace + @classmethod + def _wrap(cls, tensor: torch.Tensor, *, color_space: ColorSpace) -> Video: + image = tensor.as_subclass(cls) + image.color_space = color_space + return image + def __new__( cls, data: Any, @@ -22,7 +28,7 @@ class Video(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> Video: - data = torch.as_tensor(data, dtype=dtype, device=device) + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if data.ndim < 4: raise ValueError video = super().__new__(cls, data, requires_grad=requires_grad) @@ -35,21 +41,19 @@ class Video(_Feature): color_space = ColorSpace.from_str(color_space.upper()) elif not isinstance(color_space, ColorSpace): raise ValueError - video.color_space = color_space - - return video - def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(color_space=self.color_space) + return cls._wrap(tensor, color_space=color_space) @classmethod - def new_like( - cls, other: Video, data: Any, *, color_space: Optional[Union[ColorSpace, str]] = None, **kwargs: Any - ) -> Video: - return super().new_like( - other, data, color_space=color_space if color_space is not None else other.color_space, **kwargs + def wrap_like(cls, other: Video, tensor: torch.Tensor, *, color_space: Optional[ColorSpace] = None) -> Video: + return cls._wrap( + tensor, + color_space=color_space if color_space is not None else other.color_space, ) + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(color_space=self.color_space) + # TODO: rename this (and all instances of this term to spatial size) @property def image_size(self) -> Tuple[int, int]: @@ -67,7 +71,7 @@ class Video(_Feature): if isinstance(color_space, str): color_space = 
ColorSpace.from_str(color_space.upper()) - return Video.new_like( + return Video.wrap_like( self, self._F.convert_color_space_video( self, old_color_space=self.color_space, new_color_space=color_space, copy=copy @@ -77,11 +81,11 @@ class Video(_Feature): def horizontal_flip(self) -> Video: output = self._F.horizontal_flip_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def vertical_flip(self) -> Video: output = self._F.vertical_flip_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -91,15 +95,15 @@ class Video(_Feature): antialias: bool = False, ) -> Video: output = self._F.resize_video(self, size, interpolation=interpolation, max_size=max_size, antialias=antialias) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def crop(self, top: int, left: int, height: int, width: int) -> Video: output = self._F.crop_video(self, top, left, height, width) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def center_crop(self, output_size: List[int]) -> Video: output = self._F.center_crop_video(self, output_size=output_size) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def resized_crop( self, @@ -114,7 +118,7 @@ class Video(_Feature): output = self._F.resized_crop_video( self, top, left, height, width, size=list(size), interpolation=interpolation, antialias=antialias ) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def pad( self, @@ -123,7 +127,7 @@ class Video(_Feature): padding_mode: str = "constant", ) -> Video: output = self._F.pad_video(self, padding, fill=fill, padding_mode=padding_mode) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def rotate( self, @@ -136,7 +140,7 @@ class Video(_Feature): output = self._F._geometry.rotate_video( self, angle, interpolation=interpolation, expand=expand, fill=fill, center=center ) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def affine( self, @@ -158,7 +162,7 @@ class Video(_Feature): fill=fill, center=center, ) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def perspective( self, @@ -167,7 +171,7 @@ class Video(_Feature): fill: FillTypeJIT = None, ) -> Video: output = self._F._geometry.perspective_video(self, perspective_coeffs, interpolation=interpolation, fill=fill) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def elastic( self, @@ -176,55 +180,55 @@ class Video(_Feature): fill: FillTypeJIT = None, ) -> Video: output = self._F._geometry.elastic_video(self, displacement, interpolation=interpolation, fill=fill) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_brightness(self, brightness_factor: float) -> Video: output = self._F.adjust_brightness_video(self, brightness_factor=brightness_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_saturation(self, saturation_factor: float) -> Video: output = self._F.adjust_saturation_video(self, saturation_factor=saturation_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_contrast(self, contrast_factor: float) -> Video: output = self._F.adjust_contrast_video(self, contrast_factor=contrast_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_sharpness(self, sharpness_factor: float) -> Video: 
output = self._F.adjust_sharpness_video(self, sharpness_factor=sharpness_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_hue(self, hue_factor: float) -> Video: output = self._F.adjust_hue_video(self, hue_factor=hue_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_gamma(self, gamma: float, gain: float = 1) -> Video: output = self._F.adjust_gamma_video(self, gamma=gamma, gain=gain) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def posterize(self, bits: int) -> Video: output = self._F.posterize_video(self, bits=bits) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def solarize(self, threshold: float) -> Video: output = self._F.solarize_video(self, threshold=threshold) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def autocontrast(self) -> Video: output = self._F.autocontrast_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def equalize(self) -> Video: output = self._F.equalize_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def invert(self) -> Video: output = self._F.invert_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Video: output = self._F.gaussian_blur_video(self, kernel_size=kernel_size, sigma=sigma) - return Video.new_like(self, output) + return Video.wrap_like(self, output) VideoType = Union[torch.Tensor, Video] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 311ad6d5a..bcab0a3f4 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -119,7 +119,7 @@ class _BaseMixupCutmix(_RandomApplyTransform): raise ValueError("Need a batch of one hot labels") output = inpt.clone() output = output.roll(1, -2).mul_(1 - lam).add_(output.mul_(lam)) - return features.OneHotLabel.new_like(inpt, output) + return features.OneHotLabel.wrap_like(inpt, output) class RandomMixup(_BaseMixupCutmix): @@ -135,7 +135,7 @@ class RandomMixup(_BaseMixupCutmix): output = output.roll(1, -4).mul_(1 - lam).add_(output.mul_(lam)) if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output) + output = features.Image.wrap_like(inpt, output) return output elif isinstance(inpt, features.OneHotLabel): @@ -178,7 +178,7 @@ class RandomCutmix(_BaseMixupCutmix): output[..., y1:y2, x1:x2] = image_rolled[..., y1:y2, x1:x2] if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output) + output = features.Image.wrap_like(inpt, output) return output elif isinstance(inpt, features.OneHotLabel): @@ -213,9 +213,11 @@ class SimpleCopyPaste(_RandomApplyTransform): antialias: Optional[bool], ) -> Tuple[features.TensorImageType, Dict[str, Any]]: - paste_masks = paste_target["masks"].new_like(paste_target["masks"], paste_target["masks"][random_selection]) - paste_boxes = paste_target["boxes"].new_like(paste_target["boxes"], paste_target["boxes"][random_selection]) - paste_labels = paste_target["labels"].new_like(paste_target["labels"], paste_target["labels"][random_selection]) + paste_masks = paste_target["masks"].wrap_like(paste_target["masks"], paste_target["masks"][random_selection]) + paste_boxes = paste_target["boxes"].wrap_like(paste_target["boxes"], 
paste_target["boxes"][random_selection]) + paste_labels = paste_target["labels"].wrap_like( + paste_target["labels"], paste_target["labels"][random_selection] + ) masks = target["masks"] @@ -317,7 +319,7 @@ class SimpleCopyPaste(_RandomApplyTransform): c0, c1, c2, c3 = 0, 0, 0, 0 for i, obj in enumerate(flat_sample): if isinstance(obj, features.Image): - flat_sample[i] = features.Image.new_like(obj, output_images[c0]) + flat_sample[i] = features.Image.wrap_like(obj, output_images[c0]) c0 += 1 elif isinstance(obj, PIL.Image.Image): flat_sample[i] = F.to_image_pil(output_images[c0]) @@ -326,13 +328,13 @@ class SimpleCopyPaste(_RandomApplyTransform): flat_sample[i] = output_images[c0] c0 += 1 elif isinstance(obj, features.BoundingBox): - flat_sample[i] = features.BoundingBox.new_like(obj, output_targets[c1]["boxes"]) + flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) c1 += 1 elif isinstance(obj, features.Mask): - flat_sample[i] = features.Mask.new_like(obj, output_targets[c2]["masks"]) + flat_sample[i] = features.Mask.wrap_like(obj, output_targets[c2]["masks"]) c2 += 1 elif isinstance(obj, (features.Label, features.OneHotLabel)): - flat_sample[i] = obj.new_like(obj, output_targets[c3]["labels"]) # type: ignore[arg-type] + flat_sample[i] = obj.wrap_like(obj, output_targets[c3]["labels"]) # type: ignore[arg-type] c3 += 1 def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 4732f88d4..7e28d9d6c 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -520,7 +520,7 @@ class AugMix(_AutoAugmentBase): mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) if isinstance(orig_image_or_video, (features.Image, features.Video)): - mix = type(orig_image_or_video).new_like(orig_image_or_video, mix) # type: ignore[arg-type] + mix = type(orig_image_or_video).wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 451b57b66..67a6cc3cc 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -119,7 +119,8 @@ class RandomPhotometricDistort(Transform): output = inpt[..., permutation, :, :] if isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).new_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] + output = type(inpt).wrap_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] + elif isinstance(inpt, PIL.Image.Image): output = F.to_image_pil(output) diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index a9341415c..3979b178f 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -55,7 +55,7 @@ class Grayscale(Transform): def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: output = _F.rgb_to_grayscale(inpt, num_output_channels=self.num_output_channels) if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output, color_space=features.ColorSpace.GRAY) + output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) return output @@ -84,5 +84,5 @@ class 
RandomGrayscale(_RandomApplyTransform): def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: output = _F.rgb_to_grayscale(inpt, num_output_channels=params["num_input_channels"]) if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output, color_space=features.ColorSpace.GRAY) + output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) return output diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 1f132ec92..37e2aee02 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -158,8 +158,8 @@ class FiveCrop(Transform): ... def forward(self, sample: Tuple[Tuple[features.Image, ...], features.Label]): ... images, labels = sample ... batch_size = len(images) - ... images = features.Image.new_like(images[0], torch.stack(images)) - ... labels = features.Label.new_like(labels, labels.repeat(batch_size)) + ... images = features.Image.wrap_like(images[0], torch.stack(images)) + ... labels = features.Label.wrap_like(labels, labels.repeat(batch_size)) ... return images, labels ... >>> image = features.Image(torch.rand(3, 256, 256)) @@ -677,18 +677,18 @@ class RandomIoUCrop(Transform): is_within_crop_area = params["is_within_crop_area"] if isinstance(inpt, (features.Label, features.OneHotLabel)): - return inpt.new_like(inpt, inpt[is_within_crop_area]) # type: ignore[arg-type] + return inpt.wrap_like(inpt, inpt[is_within_crop_area]) # type: ignore[arg-type] output = F.crop(inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]) if isinstance(output, features.BoundingBox): bboxes = output[is_within_crop_area] bboxes = F.clamp_bounding_box(bboxes, output.format, output.image_size) - output = features.BoundingBox.new_like(output, bboxes) + output = features.BoundingBox.wrap_like(output, bboxes) elif isinstance(output, features.Mask): # apply is_within_crop_area if mask is one-hot encoded masks = output[is_within_crop_area] - output = features.Mask.new_like(output, masks) + output = features.Mask.wrap_like(output, masks) return output @@ -801,7 +801,7 @@ class FixedSizeCrop(Transform): bounding_boxes = cast( features.BoundingBox, F.crop(bounding_boxes, top=top, left=left, height=new_height, width=new_width) ) - bounding_boxes = features.BoundingBox.new_like( + bounding_boxes = features.BoundingBox.wrap_like( bounding_boxes, F.clamp_bounding_box( bounding_boxes, format=bounding_boxes.format, image_size=bounding_boxes.image_size @@ -840,9 +840,9 @@ class FixedSizeCrop(Transform): if params["is_valid"] is not None: if isinstance(inpt, (features.Label, features.OneHotLabel, features.Mask)): - inpt = inpt.new_like(inpt, inpt[params["is_valid"]]) # type: ignore[arg-type] + inpt = inpt.wrap_like(inpt, inpt[params["is_valid"]]) # type: ignore[arg-type] elif isinstance(inpt, features.BoundingBox): - inpt = features.BoundingBox.new_like( + inpt = features.BoundingBox.wrap_like( inpt, F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, image_size=inpt.image_size), ) diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index cb090492a..74fbcd60f 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -18,7 +18,7 @@ class ConvertBoundingBoxFormat(Transform): def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: output = 
F.convert_format_bounding_box(inpt, old_format=inpt.format, new_format=params["format"]) - return features.BoundingBox.new_like(inpt, output, format=params["format"]) + return features.BoundingBox.wrap_like(inpt, output, format=params["format"]) class ConvertImageDtype(Transform): @@ -30,7 +30,11 @@ class ConvertImageDtype(Transform): def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> features.TensorImageType: output = F.convert_image_dtype(inpt, dtype=self.dtype) - return output if features.is_simple_tensor(inpt) else features.Image.new_like(inpt, output, dtype=self.dtype) # type: ignore[arg-type] + return ( + output + if features.is_simple_tensor(inpt) + else features.Image.wrap_like(inpt, output) # type: ignore[arg-type] + ) class ConvertColorSpace(Transform): @@ -65,4 +69,4 @@ class ClampBoundingBoxes(Transform): def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: output = F.clamp_bounding_box(inpt, format=inpt.format, image_size=inpt.image_size) - return features.BoundingBox.new_like(inpt, output) + return features.BoundingBox.wrap_like(inpt, output) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 2531bf8f6..dd1e1cdf8 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -171,4 +171,4 @@ class RemoveSmallBoundingBoxes(Transform): return dict(valid_indices=valid_indices) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - return inpt.new_like(inpt, inpt[params["valid_indices"]]) + return inpt.wrap_like(inpt, inpt[params["valid_indices"]]) diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 976feb99e..847343dbf 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -35,7 +35,7 @@ def erase( if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).new_like(inpt, output) # type: ignore[arg-type] + output = type(inpt).wrap_like(inpt, output) # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return erase_image_pil(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index f205b5aea..c63fe5b41 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1409,7 +1409,7 @@ def five_crop( if isinstance(inpt, torch.Tensor): output = five_crop_image_tensor(inpt, size) if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = tuple(features.Image.new_like(inpt, item) for item in output) # type: ignore[assignment] + output = tuple(features.Image.wrap_like(inpt, item) for item in output) # type: ignore[assignment] return output else: # isinstance(inpt, PIL.Image.Image): return five_crop_image_pil(inpt, size) @@ -1446,7 +1446,7 @@ def ten_crop(inpt: features.ImageTypeJIT, size: List[int], vertical_flip: bool = if isinstance(inpt, torch.Tensor): output = ten_crop_image_tensor(inpt, size, vertical_flip=vertical_flip) if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = [features.Image.new_like(inpt, item) for 
item in output] + output = [features.Image.wrap_like(inpt, item) for item in output] return output else: # isinstance(inpt, PIL.Image.Image): return ten_crop_image_pil(inpt, size, vertical_flip=vertical_flip) -- GitLab From 6e203b44098c3371689f56abc17b7c02bd51a261 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 7 Oct 2022 17:26:50 +0100 Subject: [PATCH 022/624] [prototype] Rewrite the meta dimension methods (#6722) * Rewrite `get_dimensions`, `get_num_channels` and `get_spatial_size` * Remove `get_chw` * Remove comments * Make `get_spatial_size` support non-image input * Reduce the unnecessary use of `get_dimensions*` * Fix linters * Fix merge bug * Linter * Fix linter --- torchvision/prototype/features/_mask.py | 6 +- .../prototype/transforms/_auto_augment.py | 10 ++-- torchvision/prototype/transforms/_utils.py | 7 ++- .../transforms/functional/__init__.py | 6 ++ .../transforms/functional/_geometry.py | 21 ++++--- .../prototype/transforms/functional/_meta.py | 58 ++++++++++++------- 6 files changed, 71 insertions(+), 37 deletions(-) diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 65793dc45..7b49ce8e8 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Union +from typing import Any, cast, List, Optional, Tuple, Union import torch from torchvision.transforms import InterpolationMode @@ -32,6 +32,10 @@ class Mask(_Feature): ) -> Mask: return cls._wrap(tensor) + @property + def image_size(self) -> Tuple[int, int]: + return cast(Tuple[int, int], tuple(self.shape[-2:])) + def horizontal_flip(self) -> Mask: output = self._F.horizontal_flip_mask(self) return Mask.wrap_like(self, output) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 7e28d9d6c..6ef9edba3 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -7,7 +7,7 @@ import torch from torch.utils._pytree import tree_flatten, tree_unflatten from torchvision.prototype import features from torchvision.prototype.transforms import AutoAugmentPolicy, functional as F, InterpolationMode, Transform -from torchvision.prototype.transforms.functional._meta import get_chw +from torchvision.prototype.transforms.functional._meta import get_spatial_size from ._utils import _isinstance, _setup_fill_arg @@ -278,7 +278,7 @@ class AutoAugment(_AutoAugmentBase): sample = inputs if len(inputs) > 1 else inputs[0] id, image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(image_or_video) + height, width = get_spatial_size(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -349,7 +349,7 @@ class RandAugment(_AutoAugmentBase): sample = inputs if len(inputs) > 1 else inputs[0] id, image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(image_or_video) + height, width = get_spatial_size(image_or_video) for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -403,7 +403,7 @@ class TrivialAugmentWide(_AutoAugmentBase): sample = inputs if len(inputs) > 1 else inputs[0] id, image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(image_or_video) + height, width = get_spatial_size(image_or_video) transform_id, (magnitudes_fn, signed) = 
self._get_random_item(self._AUGMENTATION_SPACE) @@ -473,7 +473,7 @@ class AugMix(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] id, orig_image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(orig_image_or_video) + height, width = get_spatial_size(orig_image_or_video) if isinstance(orig_image_or_video, torch.Tensor): image_or_video = orig_image_or_video diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index a76891a34..db1ff4b7b 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -10,7 +10,7 @@ from torchvision._utils import sequence_to_str from torchvision.prototype import features from torchvision.prototype.features._feature import FillType -from torchvision.prototype.transforms.functional._meta import get_chw +from torchvision.prototype.transforms.functional._meta import get_dimensions from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size # noqa: F401 from typing_extensions import Literal @@ -80,7 +80,7 @@ def query_bounding_box(sample: Any) -> features.BoundingBox: def query_chw(sample: Any) -> Tuple[int, int, int]: flat_sample, _ = tree_flatten(sample) chws = { - get_chw(item) + tuple(get_dimensions(item)) for item in flat_sample if isinstance(item, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(item) } @@ -88,7 +88,8 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: raise TypeError("No image or video was found in the sample") elif len(chws) > 1: raise ValueError(f"Found multiple CxHxW dimensions in the sample: {sequence_to_str(sorted(chws))}") - return chws.pop() + c, h, w = chws.pop() + return c, h, w def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...]) -> bool: diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index cb801df73..1e918cc34 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -8,9 +8,15 @@ from ._meta import ( convert_color_space_image_pil, convert_color_space_video, convert_color_space, + get_dimensions_image_tensor, + get_dimensions_image_pil, get_dimensions, get_image_num_channels, + get_num_channels_image_tensor, + get_num_channels_image_pil, get_num_channels, + get_spatial_size_image_tensor, + get_spatial_size_image_pil, get_spatial_size, ) # usort: skip diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index c63fe5b41..670b2cb87 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -21,7 +21,12 @@ from torchvision.transforms.functional_tensor import ( interpolate, ) -from ._meta import convert_format_bounding_box, get_dimensions_image_pil, get_dimensions_image_tensor +from ._meta import ( + convert_format_bounding_box, + get_dimensions_image_tensor, + get_spatial_size_image_pil, + get_spatial_size_image_tensor, +) horizontal_flip_image_tensor = _FT.hflip horizontal_flip_image_pil = _FP.hflip @@ -323,7 +328,7 @@ def affine_image_pil( # it is visually better to estimate the center without 0.5 offset # otherwise image rotated by 90 degrees is shifted vs output image of torch.rot90 or F_t.affine if center is None: - _, height, width = 
get_dimensions_image_pil(image) + height, width = get_spatial_size_image_pil(image) center = [width * 0.5, height * 0.5] matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear) @@ -1189,13 +1194,13 @@ def _center_crop_compute_crop_anchor( def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> torch.Tensor: crop_height, crop_width = _center_crop_parse_output_size(output_size) - _, image_height, image_width = get_dimensions_image_tensor(image) + image_height, image_width = get_spatial_size_image_tensor(image) if crop_height > image_height or crop_width > image_width: padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) image = pad_image_tensor(image, padding_ltrb, fill=0) - _, image_height, image_width = get_dimensions_image_tensor(image) + image_height, image_width = get_spatial_size_image_tensor(image) if crop_width == image_width and crop_height == image_height: return image @@ -1206,13 +1211,13 @@ def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> tor @torch.jit.unused def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL.Image.Image: crop_height, crop_width = _center_crop_parse_output_size(output_size) - _, image_height, image_width = get_dimensions_image_pil(image) + image_height, image_width = get_spatial_size_image_pil(image) if crop_height > image_height or crop_width > image_width: padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) image = pad_image_pil(image, padding_ltrb, fill=0) - _, image_height, image_width = get_dimensions_image_pil(image) + image_height, image_width = get_spatial_size_image_pil(image) if crop_width == image_width and crop_height == image_height: return image @@ -1365,7 +1370,7 @@ def five_crop_image_tensor( image: torch.Tensor, size: List[int] ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: crop_height, crop_width = _parse_five_crop_size(size) - _, image_height, image_width = get_dimensions_image_tensor(image) + image_height, image_width = get_spatial_size_image_tensor(image) if crop_width > image_width or crop_height > image_height: msg = "Requested crop size {} is bigger than input size {}" @@ -1385,7 +1390,7 @@ def five_crop_image_pil( image: PIL.Image.Image, size: List[int] ) -> Tuple[PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image]: crop_height, crop_width = _parse_five_crop_size(size) - _, image_height, image_width = get_dimensions_image_pil(image) + image_height, image_width = get_spatial_size_image_pil(image) if crop_width > image_width or crop_height > image_height: msg = "Requested crop size {} is bigger than input size {}" diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index 1e53edf39..e24b68c9f 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -6,38 +6,37 @@ from torchvision.prototype import features from torchvision.prototype.features import BoundingBoxFormat, ColorSpace from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT + get_dimensions_image_tensor = _FT.get_dimensions get_dimensions_image_pil = _FP.get_dimensions -# TODO: Should this be prefixed with `_` similar to other methods that don't get exposed by init? 
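The hunks above switch the geometry kernels from get_dimensions_image_* to the new get_spatial_size_* helpers, which return only the spatial extent [height, width]; get_dimensions keeps returning [channels, height, width]. A minimal sketch of that contract, assuming the prototype functional namespace re-exports these names as in the __init__.py hunk above (this snapshot of the prototype API, not a stable interface):

    import torch
    from torchvision.prototype.transforms import functional as F

    image = torch.rand(3, 32, 48)   # plain CHW image tensor
    F.get_dimensions(image)         # -> [3, 32, 48]  (channels, height, width)
    F.get_spatial_size(image)       # -> [32, 48]     (height, width only)

For Image/Video features and PIL images, the get_spatial_size dispatcher added later in this patch resolves the size from the feature metadata or the PIL kernel instead of the raw tensor shape.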
-def get_chw(image: features.ImageOrVideoTypeJIT) -> Tuple[int, int, int]: +def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) ): - channels, height, width = get_dimensions_image_tensor(image) + return get_dimensions_image_tensor(image) elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels height, width = image.image_size - else: # isinstance(image, PIL.Image.Image) - channels, height, width = get_dimensions_image_pil(image) - return channels, height, width - - -# The three functions below are here for BC. Whether we want to have two different kernels and how they and the -# compound version should be named is still under discussion: https://github.com/pytorch/vision/issues/6491 -# Given that these kernels should also support boxes, masks, and videos, it is unlikely that there name will stay. -# They will either be deprecated or simply aliased to the new kernels if we have reached consensus about the issue -# detailed above. + return [channels, height, width] + else: + return get_dimensions_image_pil(image) -def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: - return list(get_chw(image)) +get_num_channels_image_tensor = _FT.get_image_num_channels +get_num_channels_image_pil = _FP.get_image_num_channels def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: - num_channels, *_ = get_chw(image) - return num_channels + if isinstance(image, torch.Tensor) and ( + torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) + ): + return _FT.get_image_num_channels(image) + elif isinstance(image, (features.Image, features.Video)): + return image.num_channels + else: + return _FP.get_image_num_channels(image) # We changed the names to ensure it can be used not only for images but also videos. 
Thus, we just alias it without @@ -45,9 +44,28 @@ def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: get_image_num_channels = get_num_channels -def get_spatial_size(image: features.ImageOrVideoTypeJIT) -> List[int]: - _, *size = get_chw(image) - return size +def get_spatial_size_image_tensor(image: torch.Tensor) -> List[int]: + width, height = _FT.get_image_size(image) + return [height, width] + + +@torch.jit.unused +def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: + width, height = _FP.get_image_size(image) + return [height, width] + + +def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: + if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): + return get_spatial_size_image_tensor(inpt) + elif isinstance(inpt, features._Feature): + image_size = getattr(inpt, "image_size", None) + if image_size is not None: + return list(image_size) + else: + raise ValueError(f"Type {inpt.__class__} doesn't have spatial size.") + else: + return get_spatial_size_image_pil(inpt) def _xywh_to_xyxy(xywh: torch.Tensor) -> torch.Tensor: -- GitLab From af54e5645399bc67711155de2f8bb9cb1f4ebbe1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 10 Oct 2022 09:46:09 +0100 Subject: [PATCH 023/624] [FBcode->GH] Fix GRACE_HOPPER file internal discovery (#6719) Co-authored-by: Vasilis Vryniotis --- test/test_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index d284ec6fe..a169f5053 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -44,9 +44,11 @@ def _get_image(input_shape, real_image, device): To do so, a keyword argument `real_image` was added to the abovelisted models in `_model_params` """ if real_image: - GRACE_HOPPER = get_relative_path( - os.path.dirname(os.path.realpath(__file__)), "test", "assets", "encode_jpeg", "grace_hopper_517x606.jpg" + # TODO: Maybe unify file discovery logic with test_image.py + GRACE_HOPPER = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" ) + img = Image.open(GRACE_HOPPER) original_width, original_height = img.size -- GitLab From 019139f7875c3388aa6c3cd5d65782b69b3059bf Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 10 Oct 2022 11:16:42 +0200 Subject: [PATCH 024/624] make _setup_fill_arg serializable (#6730) --- torchvision/prototype/transforms/_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index db1ff4b7b..a3980fa21 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -1,6 +1,6 @@ +import functools import numbers from collections import defaultdict - from typing import Any, Callable, Dict, Sequence, Tuple, Type, Union import PIL.Image @@ -43,13 +43,19 @@ def _check_fill_arg(fill: Union[FillType, Dict[Type, FillType]]) -> None: raise TypeError("Got inappropriate fill arg") +def _default_fill(fill: FillType) -> FillType: + return fill + + def _setup_fill_arg(fill: Union[FillType, Dict[Type, FillType]]) -> Dict[Type, FillType]: _check_fill_arg(fill) if isinstance(fill, dict): return fill - return defaultdict(lambda: fill) # type: ignore[return-value, arg-type] + # This weird looking construct only exists, since `lambda`'s cannot be serialized by pickle. 
+ # If it were possible, we could replace this with `defaultdict(lambda: fill)` + return defaultdict(functools.partial(_default_fill, fill)) def _check_padding_arg(padding: Union[int, Sequence[int]]) -> None: -- GitLab From 17969ebad94eecf8c59db531d53a205ec8993467 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 10 Oct 2022 11:17:22 +0200 Subject: [PATCH 025/624] enable arbitrary batch size for all prototype kernels (#6726) * enable arbitrary batch size for all prototype kernels * put back perspective dispatcher --- test/prototype_transforms_dispatcher_infos.py | 9 --- test/prototype_transforms_kernel_infos.py | 11 --- .../transforms/functional/_geometry.py | 76 +++++++++---------- .../prototype/transforms/functional/_misc.py | 38 +++++----- 4 files changed, 54 insertions(+), 80 deletions(-) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index be8bd3002..de933c7e3 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -138,12 +138,6 @@ def xfail_all_tests(*, reason, condition): ] -xfails_degenerate_or_multi_batch_dims = xfail_all_tests( - reason="See https://github.com/pytorch/vision/issues/6670 for details.", - condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), -) - - DISPATCHER_INFOS = [ DispatcherInfo( F.horizontal_flip, @@ -260,7 +254,6 @@ DISPATCHER_INFOS = [ pil_kernel_info=PILKernelInfo(F.perspective_image_pil), test_marks=[ xfail_dispatch_pil_if_fill_sequence_needs_broadcast, - *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( @@ -271,7 +264,6 @@ DISPATCHER_INFOS = [ features.Mask: F.elastic_mask, }, pil_kernel_info=PILKernelInfo(F.elastic_image_pil), - test_marks=xfails_degenerate_or_multi_batch_dims, ), DispatcherInfo( F.center_crop, @@ -294,7 +286,6 @@ DISPATCHER_INFOS = [ test_marks=[ xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), - *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index d90d3bf68..9ebfc7a00 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -156,12 +156,6 @@ def xfail_all_tests(*, reason, condition): ] -xfails_image_degenerate_or_multi_batch_dims = xfail_all_tests( - reason="See https://github.com/pytorch/vision/issues/6670 for details.", - condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), -) - - KERNEL_INFOS = [] @@ -1156,7 +1150,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.perspective_bounding_box, @@ -1168,7 +1161,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.perspective_video, @@ -1239,7 +1231,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( 
F.elastic_bounding_box, @@ -1251,7 +1242,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.elastic_video, @@ -1379,7 +1369,6 @@ KERNEL_INFOS.extend( test_marks=[ xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), - *xfails_image_degenerate_or_multi_batch_dims, ], ), KernelInfo( diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 670b2cb87..2c064245e 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -882,7 +882,23 @@ def perspective_image_tensor( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - return _FT.perspective(image, perspective_coeffs, interpolation=interpolation.value, fill=fill) + if image.numel() == 0: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT.perspective(image, perspective_coeffs, interpolation=interpolation.value, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output @torch.jit.unused @@ -1007,25 +1023,7 @@ def perspective_video( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when - # https://github.com/pytorch/vision/issues/6670 is resolved. - if video.numel() == 0: - return video - - shape = video.shape - - if video.ndim > 4: - video = video.view((-1,) + shape[-3:]) - needs_unsquash = True - else: - needs_unsquash = False - - output = perspective_image_tensor(video, perspective_coeffs, interpolation=interpolation, fill=fill) - - if needs_unsquash: - output = output.view(shape) - - return output + return perspective_image_tensor(video, perspective_coeffs, interpolation=interpolation, fill=fill) def perspective( @@ -1048,7 +1046,23 @@ def elastic_image_tensor( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - return _FT.elastic_transform(image, displacement, interpolation=interpolation.value, fill=fill) + if image.numel() == 0: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT.elastic_transform(image, displacement, interpolation=interpolation.value, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output @torch.jit.unused @@ -1128,25 +1142,7 @@ def elastic_video( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when - # https://github.com/pytorch/vision/issues/6670 is resolved. 
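The edits above inline the flatten/restore workaround into perspective_image_tensor and elastic_image_tensor, so the image kernels themselves accept arbitrary leading batch dimensions (degenerate, video, or extra batch dims) and the *_video wrappers reduce to plain pass-throughs. A standalone sketch of that reshape idiom; the helper name is hypothetical and the flip call is a placeholder for the real 4-D kernel, since only the shape handling is being illustrated:

    import torch

    def _apply_on_flattened_batch(image: torch.Tensor) -> torch.Tensor:
        # Flatten any extra leading dims into a single batch dim,
        # run the 4-D kernel, then restore the original shape.
        if image.numel() == 0:
            return image
        shape = image.shape
        needs_unsquash = image.ndim > 4
        if needs_unsquash:
            image = image.view((-1,) + shape[-3:])
        output = image.flip(-1)  # placeholder for the actual 4-D kernel call
        if needs_unsquash:
            output = output.view(shape)
        return output

    video = torch.rand(2, 8, 3, 16, 16)  # (batch, frames, C, H, W)
    assert _apply_on_flattened_batch(video).shape == video.shape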
- if video.numel() == 0: - return video - - shape = video.shape - - if video.ndim > 4: - video = video.view((-1,) + shape[-3:]) - needs_unsquash = True - else: - needs_unsquash = False - - output = elastic_image_tensor(video, displacement, interpolation=interpolation, fill=fill) - - if needs_unsquash: - output = output.view(shape) - - return output + return elastic_image_tensor(video, displacement, interpolation=interpolation, fill=fill) def elastic( diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 7b3773e63..79a358b4e 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -56,7 +56,23 @@ def gaussian_blur_image_tensor( if s <= 0.0: raise ValueError(f"sigma should have positive values. Got {sigma}") - return _FT.gaussian_blur(image, kernel_size, sigma) + if image.numel() == 0: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT.gaussian_blur(image, kernel_size, sigma) + + if needs_unsquash: + output = output.view(shape) + + return output @torch.jit.unused @@ -71,25 +87,7 @@ def gaussian_blur_image_pil( def gaussian_blur_video( video: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> torch.Tensor: - # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when - # https://github.com/pytorch/vision/issues/6670 is resolved. - if video.numel() == 0: - return video - - shape = video.shape - - if video.ndim > 4: - video = video.view((-1,) + shape[-3:]) - needs_unsquash = True - else: - needs_unsquash = False - - output = gaussian_blur_image_tensor(video, kernel_size, sigma) - - if needs_unsquash: - output = output.view(shape) - - return output + return gaussian_blur_image_tensor(video, kernel_size, sigma) def gaussian_blur( -- GitLab From 3f1d9f6b21464aa023327dd0d2b397648470c387 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 10 Oct 2022 13:07:23 +0200 Subject: [PATCH 026/624] Refactor `KernelInfo` and `DispatcherInfo` (#6710) * make args and kwargs in ArgsKwargs more accessible * refactor KernelInfo and DispatcherInfo * remove ArgsKwargs __getitem__ shortcut again --- test/prototype_common_utils.py | 53 +++++++++++ test/prototype_transforms_dispatcher_infos.py | 94 ++++++++++--------- test/prototype_transforms_kernel_infos.py | 92 +++++++----------- test/test_prototype_transforms_functional.py | 33 +++---- 4 files changed, 150 insertions(+), 122 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index c10cec94c..1d5766b1f 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -3,6 +3,7 @@ import collections.abc import dataclasses import functools +from collections import defaultdict from typing import Callable, Optional, Sequence, Tuple, Union import PIL.Image @@ -47,6 +48,9 @@ __all__ = [ "make_masks", "make_video", "make_videos", + "TestMark", + "mark_framework_limitation", + "InfoBase", ] @@ -588,3 +592,52 @@ def make_video_loaders( make_videos = from_loaders(make_video_loaders) + + +class TestMark: + def __init__( + self, + # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is + # no test class, i.e. a standalone test function, use `None`. + test_id, + # `pytest.mark.*` to apply, e.g. 
`pytest.mark.skip` or `pytest.mark.xfail` + mark, + *, + # Callable, that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be + # applied. If omitted, defaults to always apply. + condition=None, + ): + self.test_id = test_id + self.mark = mark + self.condition = condition or (lambda args_kwargs: True) + + +def mark_framework_limitation(test_id, reason): + # The purpose of this function is to have a single entry point for skip marks that are only there, because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we are wasting CI resources for no reason for most of the time + return TestMark(test_id, pytest.mark.skip(reason=reason)) + + +class InfoBase: + def __init__(self, *, id, test_marks=None, closeness_kwargs=None): + # Identifier if the info that shows up the parametrization. + self.id = id + # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. + # See the `TestMark` class for details + self.test_marks = test_marks or [] + # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. + self.closeness_kwargs = closeness_kwargs or dict() + + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index de933c7e3..82173907c 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -1,57 +1,67 @@ import collections.abc -import dataclasses - -from collections import defaultdict - -from typing import Callable, Dict, List, Optional, Sequence, Type import pytest import torchvision.prototype.transforms.functional as F -from prototype_transforms_kernel_infos import KERNEL_INFOS, TestMark +from prototype_common_utils import InfoBase, TestMark +from prototype_transforms_kernel_infos import KERNEL_INFOS from torchvision.prototype import features __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] -KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} - - -@dataclasses.dataclass -class PILKernelInfo: - kernel: Callable - kernel_name: str = dataclasses.field(default=None) - - def __post_init__(self): - self.kernel_name = self.kernel_name or self.kernel.__name__ - -@dataclasses.dataclass -class DispatcherInfo: - dispatcher: Callable - kernels: Dict[Type, Callable] - pil_kernel_info: Optional[PILKernelInfo] = None - method_name: str = dataclasses.field(default=None) - test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) - _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) - - def __post_init__(self): - self.kernel_infos = {feature_type: KERNEL_INFO_MAP[kernel] for feature_type, kernel in self.kernels.items()} - self.method_name = self.method_name or self.dispatcher.__name__ - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map 
= dict(test_marks_map) - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] +class PILKernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + ): + super().__init__(id=kernel_name or kernel.__name__) + self.kernel = kernel + + +class DispatcherInfo(InfoBase): + _KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} + + def __init__( + self, + dispatcher, + *, + # Dictionary of types that map to the kernel the dispatcher dispatches to. + kernels, + # If omitted, no PIL dispatch test will be performed. + pil_kernel_info=None, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=dispatcher.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.dispatcher = dispatcher + self.kernels = kernels + self.pil_kernel_info = pil_kernel_info + + kernel_infos = {} + for feature_type, kernel in self.kernels.items(): + kernel_info = self._KERNEL_INFO_MAP.get(kernel) + if not kernel_info: + raise pytest.UsageError( + f"Can't register {kernel.__name__} for type {feature_type} since there is no `KernelInfo` for it. " + f"Please add a `KernelInfo` for it in `prototype_transforms_kernel_infos.py`." + ) + kernel_infos[feature_type] = kernel_info + self.kernel_infos = kernel_infos def sample_inputs(self, *feature_types, filter_metadata=True): - for feature_type in feature_types or self.kernels.keys(): - if feature_type not in self.kernels: - raise pytest.UsageError(f"There is no kernel registered for type {feature_type.__name__}") + for feature_type in feature_types or self.kernel_infos.keys(): + kernel_info = self.kernel_infos.get(feature_type) + if not kernel_info: + raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") + + sample_inputs = kernel_info.sample_inputs_fn() - sample_inputs = self.kernel_infos[feature_type].sample_inputs_fn() if not filter_metadata: yield from sample_inputs else: diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 9ebfc7a00..34f1f875a 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1,26 +1,24 @@ -import dataclasses import functools import itertools import math -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple import numpy as np import pytest import torch.testing import torchvision.ops import torchvision.prototype.transforms.functional as F - -from _pytest.mark.structures import MarkDecorator from common_utils import cycle_over from datasets_utils import combinations_grid from prototype_common_utils import ( ArgsKwargs, + InfoBase, make_bounding_box_loaders, make_image_loader, make_image_loaders, make_mask_loaders, make_video_loaders, + mark_framework_limitation, + TestMark, VALID_EXTRA_DIMS, ) from torchvision.prototype import features @@ -29,51 +27,35 @@ from torchvision.transforms.functional_tensor import _max_value as get_max_value __all__ = ["KernelInfo", "KERNEL_INFOS"] -TestID = Tuple[Optional[str], str] - - -@dataclasses.dataclass -class TestMark: - test_id: TestID - mark: MarkDecorator - condition: Callable[[ArgsKwargs], bool] = lambda args_kwargs: True - 
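The refactor above replaces the per-module dataclasses with a shared InfoBase: test marks are grouped by test id at construction time, and get_marks returns the pytest marks whose condition matches a concrete ArgsKwargs. A small usage sketch under the new API; the kernel, test id, and reason are illustrative only, and the imports assume the test helper modules are importable as they are when running the suite from the test/ directory:

    import pytest
    import torch
    import torchvision.prototype.transforms.functional as F
    from prototype_common_utils import ArgsKwargs, TestMark
    from prototype_transforms_kernel_infos import KernelInfo

    # Hypothetical info entry: xfail one test for every parametrization of this kernel.
    info = KernelInfo(
        F.horizontal_flip_image_tensor,
        sample_inputs_fn=lambda: [ArgsKwargs(torch.rand(3, 8, 8))],
        test_marks=[
            TestMark(
                ("TestKernels", "test_scripted_vs_eager"),
                pytest.mark.xfail(reason="illustrative only"),
                condition=lambda args_kwargs: True,
            )
        ],
    )

    args_kwargs = next(iter(info.sample_inputs_fn()))
    # Returns the xfail mark above, since its condition matches this ArgsKwargs.
    marks = info.get_marks(("TestKernels", "test_scripted_vs_eager"), args_kwargs)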
- -@dataclasses.dataclass -class KernelInfo: - kernel: Callable - # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but should - # not include extensive parameter combinations to keep to overall test count moderate. - sample_inputs_fn: Callable[[], Iterable[ArgsKwargs]] - # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name - # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name: str = dataclasses.field(default=None) - # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also take - # tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should happen - # inside the function. It should return a tensor or to be more precise an object that can be compared to a - # tensor by `assert_close`. If omitted, no reference test will be performed. - reference_fn: Optional[Callable] = None - # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter - # values to be tested. If not specified, `sample_inputs_fn` will be used. - reference_inputs_fn: Optional[Callable[[], Iterable[ArgsKwargs]]] = None - # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. - closeness_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) - test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) - _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) - - def __post_init__(self): - self.kernel_name = self.kernel_name or self.kernel.__name__ - self.reference_inputs_fn = self.reference_inputs_fn or self.sample_inputs_fn - - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map = dict(test_marks_map) - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] +class KernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but + # should not include extensive parameter combinations to keep to overall test count moderate. + sample_inputs_fn, + # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also + # take tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should + # happen inside the function. It should return a tensor or to be more precise an object that can be compared to + # a tensor by `assert_close`. If omitted, no reference test will be performed. + reference_fn=None, + # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter + # values to be tested. If not specified, `sample_inputs_fn` will be used. 
+ reference_inputs_fn=None, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.kernel = kernel + self.sample_inputs_fn = sample_inputs_fn + self.reference_fn = reference_fn + self.reference_inputs_fn = reference_inputs_fn DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -97,16 +79,6 @@ def pil_reference_wrapper(pil_kernel): return wrapper -def mark_framework_limitation(test_id, reason): - # The purpose of this function is to have a single entry point for skip marks that are only there, because the test - # framework cannot handle the kernel in general or a specific parameter combination. - # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is - # still justified. - # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, - # we are wasting CI resources for no reason for most of the time. - return TestMark(test_id, pytest.mark.skip(reason=reason)) - - def xfail_jit_python_scalar_arg(name, *, reason=None): reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" return TestMark( diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 5adea4d26..8329de697 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1,4 +1,3 @@ -import functools import math import os @@ -27,7 +26,7 @@ def script(fn): raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error -def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, name_fn=lambda info: str(info)): +def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None): if condition is None: def condition(info): @@ -41,7 +40,7 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n elif len(parts) == 2: test_class_name, test_function_name = parts else: - raise pytest.UsageError("Unable to parse the test class and test name from test function") + raise pytest.UsageError("Unable to parse the test class name and test function name from test function") test_id = (test_class_name, test_function_name) argnames = ("info", "args_kwargs") @@ -51,7 +50,6 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n continue args_kwargs = list(args_kwargs_fn(info)) - name = name_fn(info) idx_field_len = len(str(len(args_kwargs))) for idx, args_kwargs_ in enumerate(args_kwargs): @@ -60,7 +58,7 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n info, args_kwargs_, marks=info.get_marks(test_id, args_kwargs_), - id=f"{name}-{idx:0{idx_field_len}}", + id=f"{info.id}-{idx:0{idx_field_len}}", ) ) @@ -70,14 +68,11 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n class TestKernels: - make_kernel_args_kwargs_parametrization = functools.partial( - make_args_kwargs_parametrization, name_fn=lambda info: info.kernel_name - ) - sample_inputs = kernel_sample_inputs = make_kernel_args_kwargs_parametrization( + sample_inputs = make_info_args_kwargs_parametrization( KERNEL_INFOS, args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), ) - reference_inputs = make_kernel_args_kwargs_parametrization( + reference_inputs = make_info_args_kwargs_parametrization( 
KERNEL_INFOS, args_kwargs_fn=lambda info: info.reference_inputs_fn(), condition=lambda info: info.reference_fn is not None, @@ -208,10 +203,7 @@ def spy_on(mocker): class TestDispatchers: - make_dispatcher_args_kwargs_parametrization = functools.partial( - make_args_kwargs_parametrization, name_fn=lambda info: info.dispatcher.__name__ - ) - image_sample_inputs = kernel_sample_inputs = make_dispatcher_args_kwargs_parametrization( + image_sample_inputs = make_info_args_kwargs_parametrization( DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(features.Image), condition=lambda info: features.Image in info.kernels, @@ -251,13 +243,13 @@ class TestDispatchers: image_simple_tensor = torch.Tensor(image_feature) kernel_info = info.kernel_infos[features.Image] - spy = spy_on(kernel_info.kernel, module=info.dispatcher.__module__, name=kernel_info.kernel_name) + spy = spy_on(kernel_info.kernel, module=info.dispatcher.__module__, name=kernel_info.id) info.dispatcher(image_simple_tensor, *other_args, **kwargs) spy.assert_called_once() - @make_dispatcher_args_kwargs_parametrization( + @make_info_args_kwargs_parametrization( DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(features.Image), condition=lambda info: info.pil_kernel_info is not None, @@ -271,22 +263,23 @@ class TestDispatchers: image_pil = F.to_image_pil(image_feature) pil_kernel_info = info.pil_kernel_info - spy = spy_on(pil_kernel_info.kernel, module=info.dispatcher.__module__, name=pil_kernel_info.kernel_name) + spy = spy_on(pil_kernel_info.kernel, module=info.dispatcher.__module__, name=pil_kernel_info.id) info.dispatcher(image_pil, *other_args, **kwargs) spy.assert_called_once() - @make_dispatcher_args_kwargs_parametrization( + @make_info_args_kwargs_parametrization( DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(), ) def test_dispatch_feature(self, info, args_kwargs, spy_on): (feature, *other_args), kwargs = args_kwargs.load() - method = getattr(feature, info.method_name) + method_name = info.id + method = getattr(feature, method_name) feature_type = type(feature) - spy = spy_on(method, module=feature_type.__module__, name=f"{feature_type.__name__}.{info.method_name}") + spy = spy_on(method, module=feature_type.__module__, name=f"{feature_type.__name__}.{method_name}") info.dispatcher(feature, *other_args, **kwargs) -- GitLab From 0ab50f5fabbb976a70c815d49fec4a56b8f46359 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 10 Oct 2022 12:32:31 +0100 Subject: [PATCH 027/624] Remove performance workaround for mask resize (#6729) * Remove performance workaround for mask resize * Fix linter * bug fixes * remove unnecessary import * Fixing linter --- test/prototype_transforms_kernel_infos.py | 11 ------ .../transforms/functional/_geometry.py | 39 ++++--------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 34f1f875a..c8cca77e0 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -19,7 +19,6 @@ from prototype_common_utils import ( make_video_loaders, mark_framework_limitation, TestMark, - VALID_EXTRA_DIMS, ) from torchvision.prototype import features from torchvision.transforms.functional_tensor import _max_value as get_max_value @@ -215,16 +214,6 @@ def sample_inputs_resize_image_tensor(): ): yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) - # We have a speed hack in 
place for nearest interpolation and single channel images (grayscale) - for image_loader in make_image_loaders( - sizes=["random"], - color_spaces=[features.ColorSpace.GRAY], - extra_dims=VALID_EXTRA_DIMS, - ): - yield ArgsKwargs( - image_loader, size=[min(image_loader.image_size) + 1], interpolation=F.InterpolationMode.NEAREST - ) - yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 2c064245e..93df59ad6 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -14,12 +14,7 @@ from torchvision.transforms.functional import ( pil_to_tensor, to_pil_image, ) -from torchvision.transforms.functional_tensor import ( - _cast_squeeze_in, - _cast_squeeze_out, - _parse_pad_padding, - interpolate, -) +from torchvision.transforms.functional_tensor import _parse_pad_padding from ._meta import ( convert_format_bounding_box, @@ -130,32 +125,12 @@ def resize_image_tensor( if image.numel() > 0: image = image.view(-1, num_channels, old_height, old_width) - # This is a perf hack to avoid slow channels_last upsample code path - # Related issue: https://github.com/pytorch/pytorch/issues/83840 - # We are transforming (N, 1, H, W) into (N, 2, H, W) to force to take channels_first path - if image.shape[1] == 1 and interpolation == InterpolationMode.NEAREST: - # Below code is copied from _FT.resize - # This is due to the fact that we need to apply the hack on casted image and not before - # Otherwise, image will be copied while cast to float and interpolate will work on twice more data - image, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(image, [torch.float32, torch.float64]) - - shape = (image.shape[0], 2, image.shape[2], image.shape[3]) - image = image.expand(shape) - - image = interpolate( - image, size=[new_height, new_width], mode=interpolation.value, align_corners=None, antialias=False - ) - - image = image[:, 0, ...] - image = _cast_squeeze_out(image, need_cast=need_cast, need_squeeze=need_squeeze, out_dtype=out_dtype) - - else: - image = _FT.resize( - image, - size=[new_height, new_width], - interpolation=interpolation.value, - antialias=antialias, - ) + image = _FT.resize( + image, + size=[new_height, new_width], + interpolation=interpolation.value, + antialias=antialias, + ) return image.view(extra_dims + (num_channels, new_height, new_width)) -- GitLab From a3fe870b0f036e6b7917200b5a884e57c22ec6cf Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 10 Oct 2022 12:40:35 +0100 Subject: [PATCH 028/624] Adding support of Video to remaining Transforms and Kernels (#6724) * Adding support of Video to missed Transforms and Kernels * Fixing Grayscale Transform. * Fixing FiveCrop and TenCrop Transforms. * Fix Linter * Fix more kernels. * Add `five_crop_video` and `ten_crop_video` kernels * Added a TODO. * Missed Video isinstance * nits * Fix bug on AugMix * Nits and TODOs. 
* Reapply Philip's recommendation * Fix mypy and JIT * Fixing test --- torchvision/prototype/features/__init__.py | 12 ++++++- torchvision/prototype/features/_video.py | 1 + torchvision/prototype/transforms/_augment.py | 1 + .../prototype/transforms/_auto_augment.py | 5 +-- torchvision/prototype/transforms/_color.py | 2 +- .../prototype/transforms/_deprecated.py | 16 ++++----- torchvision/prototype/transforms/_geometry.py | 27 +++++++++------ torchvision/prototype/transforms/_meta.py | 10 +++--- torchvision/prototype/transforms/_misc.py | 1 + .../transforms/functional/__init__.py | 2 ++ .../transforms/functional/_augment.py | 2 +- .../transforms/functional/_deprecated.py | 11 ++++--- .../transforms/functional/_geometry.py | 33 ++++++++++++++----- .../prototype/transforms/functional/_meta.py | 6 +++- 14 files changed, 88 insertions(+), 41 deletions(-) diff --git a/torchvision/prototype/features/__init__.py b/torchvision/prototype/features/__init__.py index 6fc2fb6ea..944ae9bd3 100644 --- a/torchvision/prototype/features/__init__.py +++ b/torchvision/prototype/features/__init__.py @@ -13,4 +13,14 @@ from ._image import ( ) from ._label import Label, OneHotLabel from ._mask import Mask -from ._video import ImageOrVideoType, ImageOrVideoTypeJIT, TensorImageOrVideoType, TensorImageOrVideoTypeJIT, Video +from ._video import ( + ImageOrVideoType, + ImageOrVideoTypeJIT, + LegacyVideoType, + LegacyVideoTypeJIT, + TensorImageOrVideoType, + TensorImageOrVideoTypeJIT, + Video, + VideoType, + VideoTypeJIT, +) diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index a58027243..e32c36d5d 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -238,6 +238,7 @@ LegacyVideoTypeJIT = torch.Tensor TensorVideoType = Union[torch.Tensor, Video] TensorVideoTypeJIT = torch.Tensor +# TODO: decide if we should do definitions for both Images and Videos or use unions in the methods ImageOrVideoType = Union[ImageType, VideoType] ImageOrVideoTypeJIT = Union[ImageTypeJIT, VideoTypeJIT] TensorImageOrVideoType = Union[TensorImageType, TensorVideoType] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index bcab0a3f4..7b2dca8a6 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -99,6 +99,7 @@ class RandomErasing(_RandomApplyTransform): return inpt +# TODO: Add support for Video: https://github.com/pytorch/vision/issues/6731 class _BaseMixupCutmix(_RandomApplyTransform): def __init__(self, alpha: float, p: float = 0.5) -> None: super().__init__(p=p) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 6ef9edba3..d078cb2d1 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -483,7 +483,8 @@ class AugMix(_AutoAugmentBase): augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE orig_dims = list(image_or_video.shape) - batch = image_or_video.view([1] * max(4 - image_or_video.ndim, 0) + orig_dims) + expected_dim = 5 if isinstance(orig_image_or_video, features.Video) else 4 + batch = image_or_video.view([1] * max(expected_dim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) # Sample the beta weights for combining the original and augmented image or video. 
To get Beta, we use a @@ -520,7 +521,7 @@ class AugMix(_AutoAugmentBase): mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) if isinstance(orig_image_or_video, (features.Image, features.Video)): - mix = type(orig_image_or_video).wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] + mix = orig_image_or_video.wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 67a6cc3cc..340e721da 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -119,7 +119,7 @@ class RandomPhotometricDistort(Transform): output = inpt[..., permutation, :, :] if isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).wrap_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] + output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] elif isinstance(inpt, PIL.Image.Image): output = F.to_image_pil(output) diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index 3979b178f..f8aec22b9 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -29,7 +29,7 @@ class ToTensor(Transform): class Grayscale(Transform): - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, num_output_channels: Literal[1, 3] = 1) -> None: deprecation_msg = ( @@ -52,15 +52,15 @@ class Grayscale(Transform): super().__init__() self.num_output_channels = num_output_channels - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: output = _F.rgb_to_grayscale(inpt, num_output_channels=self.num_output_channels) - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) + if isinstance(inpt, (features.Image, features.Video)): + output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] return output class RandomGrayscale(_RandomApplyTransform): - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, p: float = 0.1) -> None: warnings.warn( @@ -81,8 +81,8 @@ class RandomGrayscale(_RandomApplyTransform): num_input_channels, _, _ = query_chw(sample) return dict(num_input_channels=num_input_channels) - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: output = _F.rgb_to_grayscale(inpt, num_output_channels=params["num_input_channels"]) - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) + if isinstance(inpt, (features.Image, features.Video)): + output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] return output diff --git a/torchvision/prototype/transforms/_geometry.py 
b/torchvision/prototype/transforms/_geometry.py index 37e2aee02..371ea7f69 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -155,12 +155,13 @@ class FiveCrop(Transform): """ Example: >>> class BatchMultiCrop(transforms.Transform): - ... def forward(self, sample: Tuple[Tuple[features.Image, ...], features.Label]): - ... images, labels = sample - ... batch_size = len(images) - ... images = features.Image.wrap_like(images[0], torch.stack(images)) + ... def forward(self, sample: Tuple[Tuple[Union[features.Image, features.Video], ...], features.Label]): + ... images_or_videos, labels = sample + ... batch_size = len(images_or_videos) + ... image_or_video = images_or_videos[0] + ... images_or_videos = image_or_video.wrap_like(image_or_video, torch.stack(images_or_videos)) ... labels = features.Label.wrap_like(labels, labels.repeat(batch_size)) - ... return images, labels + ... return images_or_videos, labels ... >>> image = features.Image(torch.rand(3, 256, 256)) >>> label = features.Label(0) @@ -172,15 +173,21 @@ class FiveCrop(Transform): torch.Size([5]) """ - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, size: Union[int, Sequence[int]]) -> None: super().__init__() self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") def _transform( - self, inpt: features.ImageType, params: Dict[str, Any] - ) -> Tuple[features.ImageType, features.ImageType, features.ImageType, features.ImageType, features.ImageType]: + self, inpt: features.ImageOrVideoType, params: Dict[str, Any] + ) -> Tuple[ + features.ImageOrVideoType, + features.ImageOrVideoType, + features.ImageOrVideoType, + features.ImageOrVideoType, + features.ImageOrVideoType, + ]: return F.five_crop(inpt, self.size) def forward(self, *inputs: Any) -> Any: @@ -194,14 +201,14 @@ class TenCrop(Transform): See :class:`~torchvision.prototype.transforms.FiveCrop` for an example. 
""" - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None: super().__init__() self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> List[features.ImageType]: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> List[features.ImageOrVideoType]: return F.ten_crop(inpt, self.size, vertical_flip=self.vertical_flip) def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index 74fbcd60f..e5c7d05b0 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -22,18 +22,18 @@ class ConvertBoundingBoxFormat(Transform): class ConvertImageDtype(Transform): - _transformed_types = (features.is_simple_tensor, features.Image) + _transformed_types = (features.is_simple_tensor, features.Image, features.Video) def __init__(self, dtype: torch.dtype = torch.float32) -> None: super().__init__() self.dtype = dtype - def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> features.TensorImageType: + def _transform( + self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any] + ) -> features.TensorImageOrVideoType: output = F.convert_image_dtype(inpt, dtype=self.dtype) return ( - output - if features.is_simple_tensor(inpt) - else features.Image.wrap_like(inpt, output) # type: ignore[arg-type] + output if features.is_simple_tensor(inpt) else type(inpt).wrap_like(inpt, output) # type: ignore[attr-defined] ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index dd1e1cdf8..d3c8a57dc 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -140,6 +140,7 @@ class GaussianBlur(Transform): return F.gaussian_blur(inpt, self.kernel_size, **params) +# TODO: Enhance as described at https://github.com/pytorch/vision/issues/6697 class ToDtype(Lambda): def __init__(self, dtype: torch.dtype, *types: Type) -> None: self.dtype = dtype diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index 1e918cc34..579442dc7 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -96,6 +96,7 @@ from ._geometry import ( five_crop, five_crop_image_pil, five_crop_image_tensor, + five_crop_video, hflip, # TODO: Consider moving all pure alias definitions at the bottom of the file horizontal_flip, horizontal_flip_bounding_box, @@ -136,6 +137,7 @@ from ._geometry import ( ten_crop, ten_crop_image_pil, ten_crop_image_tensor, + ten_crop_video, vertical_flip, vertical_flip_bounding_box, vertical_flip_image_pil, diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 847343dbf..57c3602cc 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -35,7 +35,7 @@ def erase( if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) if not 
torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).wrap_like(inpt, output) # type: ignore[arg-type] + output = inpt.wrap_like(inpt, output) # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return erase_image_pil(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) diff --git a/torchvision/prototype/transforms/functional/_deprecated.py b/torchvision/prototype/transforms/functional/_deprecated.py index cbdea5130..854920b96 100644 --- a/torchvision/prototype/transforms/functional/_deprecated.py +++ b/torchvision/prototype/transforms/functional/_deprecated.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List +from typing import Any, List, Union import PIL.Image import torch @@ -22,10 +22,13 @@ def to_grayscale(inpt: PIL.Image.Image, num_output_channels: int = 1) -> PIL.Ima return _F.to_grayscale(inpt, num_output_channels=num_output_channels) -def rgb_to_grayscale(inpt: features.LegacyImageTypeJIT, num_output_channels: int = 1) -> features.LegacyImageTypeJIT: +def rgb_to_grayscale( + inpt: Union[features.LegacyImageTypeJIT, features.LegacyVideoTypeJIT], num_output_channels: int = 1 +) -> Union[features.LegacyImageTypeJIT, features.LegacyVideoTypeJIT]: old_color_space = ( features._image._from_tensor_shape(inpt.shape) # type: ignore[arg-type] - if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Image)) + if isinstance(inpt, torch.Tensor) + and (torch.jit.is_scripting() or not isinstance(inpt, (features.Image, features.Video))) else None ) @@ -56,7 +59,7 @@ def to_tensor(inpt: Any) -> torch.Tensor: return _F.to_tensor(inpt) -def get_image_size(inpt: features.ImageTypeJIT) -> List[int]: +def get_image_size(inpt: features.ImageOrVideoTypeJIT) -> List[int]: warnings.warn( "The function `get_image_size(...)` is deprecated and will be removed in a future release. " "Instead, please use `get_spatial_size(...)` which returns `[h, w]` instead of `[w, h]`." 
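The dispatcher pattern in the hunks above repeats throughout this patch: a plain tensor is routed to the `*_image_tensor` kernel, while a `features.Image` or `features.Video` input is re-wrapped via `wrap_like` so the subclass and its metadata survive the op. A minimal sketch of what that buys the caller, assuming the prototype API shown in these hunks (the shapes and values below are illustrative, not part of the patch):

import torch
from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F

# A single clip laid out as (T, C, H, W); erase() indexes only the trailing
# (..., H, W) dims, so the same kernel serves images and videos.
video = features.Video(torch.rand(8, 3, 32, 32))

# The value tensor matches the erased region: (C, h, w).
out = F.erase(video, i=2, j=2, h=8, w=8, v=torch.zeros(3, 8, 8))

# Because the input was a features.Video, the dispatcher re-wraps the result,
# so downstream transforms still see a Video rather than a bare tensor.
assert isinstance(out, features.Video)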
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 93df59ad6..44b4986ab 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1376,16 +1376,27 @@ def five_crop_image_pil( return tl, tr, bl, br, center +def five_crop_video( + video: torch.Tensor, size: List[int] +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + return five_crop_image_tensor(video, size) + + def five_crop( - inpt: features.ImageTypeJIT, size: List[int] + inpt: features.ImageOrVideoTypeJIT, size: List[int] ) -> Tuple[ - features.ImageTypeJIT, features.ImageTypeJIT, features.ImageTypeJIT, features.ImageTypeJIT, features.ImageTypeJIT + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, ]: - # TODO: consider breaking BC here to return List[features.ImageTypeJIT] to align this op with `ten_crop` + # TODO: consider breaking BC here to return List[features.ImageOrVideoTypeJIT] to align this op with `ten_crop` if isinstance(inpt, torch.Tensor): output = five_crop_image_tensor(inpt, size) - if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = tuple(features.Image.wrap_like(inpt, item) for item in output) # type: ignore[assignment] + if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): + tmp = tuple(inpt.wrap_like(inpt, item) for item in output) # type: ignore[arg-type] + output = tmp # type: ignore[assignment] return output else: # isinstance(inpt, PIL.Image.Image): return five_crop_image_pil(inpt, size) @@ -1418,11 +1429,17 @@ def ten_crop_image_pil(image: PIL.Image.Image, size: List[int], vertical_flip: b return [tl, tr, bl, br, center, tl_flip, tr_flip, bl_flip, br_flip, center_flip] -def ten_crop(inpt: features.ImageTypeJIT, size: List[int], vertical_flip: bool = False) -> List[features.ImageTypeJIT]: +def ten_crop_video(video: torch.Tensor, size: List[int], vertical_flip: bool = False) -> List[torch.Tensor]: + return ten_crop_image_tensor(video, size, vertical_flip=vertical_flip) + + +def ten_crop( + inpt: features.ImageOrVideoTypeJIT, size: List[int], vertical_flip: bool = False +) -> List[features.ImageOrVideoTypeJIT]: if isinstance(inpt, torch.Tensor): output = ten_crop_image_tensor(inpt, size, vertical_flip=vertical_flip) - if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = [features.Image.wrap_like(inpt, item) for item in output] + if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): + output = [inpt.wrap_like(inpt, item) for item in output] # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return ten_crop_image_pil(inpt, size, vertical_flip=vertical_flip) diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index e24b68c9f..c03d65c95 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -55,6 +55,10 @@ def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: return [height, width] +# TODO: Should we have get_spatial_size_video here? How about masks/bbox etc? What is the criterion for deciding when +# a kernel will be created? 
+ + def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return get_spatial_size_image_tensor(inpt) @@ -246,7 +250,7 @@ def convert_color_space( ): if old_color_space is None: raise RuntimeError( - "In order to convert the color space of simple tensor images, " + "In order to convert the color space of simple tensors, " "the `old_color_space=...` parameter needs to be passed." ) return convert_color_space_image_tensor( -- GitLab From 12adc5426ef345ab7999661538a60da99dd85281 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 10 Oct 2022 14:56:22 +0100 Subject: [PATCH 029/624] Add video support on MixUp and CutMix (#6733) * Add video support on MixUp and CutMix * Switch back to roll * Fix tests and mypy * Another mypy fix --- test/test_prototype_transforms.py | 9 +++-- torchvision/prototype/transforms/_augment.py | 37 +++++++++++-------- .../prototype/transforms/_auto_augment.py | 4 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 4037a7467..d7a41e7c1 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -112,9 +112,12 @@ class TestSmoke: ( transform, [ - dict(image=image, one_hot_label=one_hot_label) - for image, one_hot_label in itertools.product( - make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + dict(inpt=inpt, one_hot_label=one_hot_label) + for inpt, one_hot_label in itertools.product( + itertools.chain( + make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + make_videos(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + ), make_one_hot_labels(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), ) ], diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 7b2dca8a6..4bfb5c9ed 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -107,8 +107,11 @@ class _BaseMixupCutmix(_RandomApplyTransform): self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) def forward(self, *inputs: Any) -> Any: - if not (has_any(inputs, features.Image, features.is_simple_tensor) and has_any(inputs, features.OneHotLabel)): - raise TypeError(f"{type(self).__name__}() is only defined for tensor images and one-hot labels.") + if not ( + has_any(inputs, features.Image, features.Video, features.is_simple_tensor) + and has_any(inputs, features.OneHotLabel) + ): + raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") if has_any(inputs, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." 
@@ -119,7 +122,7 @@ class _BaseMixupCutmix(_RandomApplyTransform): if inpt.ndim < 2: raise ValueError("Need a batch of one hot labels") output = inpt.clone() - output = output.roll(1, -2).mul_(1 - lam).add_(output.mul_(lam)) + output = output.roll(1, 0).mul_(1.0 - lam).add_(output.mul_(lam)) return features.OneHotLabel.wrap_like(inpt, output) @@ -129,14 +132,15 @@ class RandomMixup(_BaseMixupCutmix): def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: lam = params["lam"] - if isinstance(inpt, features.Image) or features.is_simple_tensor(inpt): - if inpt.ndim < 4: - raise ValueError("Need a batch of images") + if isinstance(inpt, (features.Image, features.Video)) or features.is_simple_tensor(inpt): + expected_ndim = 5 if isinstance(inpt, features.Video) else 4 + if inpt.ndim < expected_ndim: + raise ValueError("The transform expects a batched input") output = inpt.clone() - output = output.roll(1, -4).mul_(1 - lam).add_(output.mul_(lam)) + output = output.roll(1, 0).mul_(1.0 - lam).add_(output.mul_(lam)) - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output) + if isinstance(inpt, (features.Image, features.Video)): + output = type(inpt).wrap_like(inpt, output) # type: ignore[arg-type] return output elif isinstance(inpt, features.OneHotLabel): @@ -169,17 +173,18 @@ class RandomCutmix(_BaseMixupCutmix): return dict(box=box, lam_adjusted=lam_adjusted) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - if isinstance(inpt, features.Image) or features.is_simple_tensor(inpt): + if isinstance(inpt, (features.Image, features.Video)) or features.is_simple_tensor(inpt): box = params["box"] - if inpt.ndim < 4: - raise ValueError("Need a batch of images") + expected_ndim = 5 if isinstance(inpt, features.Video) else 4 + if inpt.ndim < expected_ndim: + raise ValueError("The transform expects a batched input") x1, y1, x2, y2 = box - image_rolled = inpt.roll(1, -4) + rolled = inpt.roll(1, 0) output = inpt.clone() - output[..., y1:y2, x1:x2] = image_rolled[..., y1:y2, x1:x2] + output[..., y1:y2, x1:x2] = rolled[..., y1:y2, x1:x2] - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output) + if isinstance(inpt, (features.Image, features.Video)): + output = inpt.wrap_like(inpt, output) # type: ignore[arg-type] return output elif isinstance(inpt, features.OneHotLabel): diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index d078cb2d1..b35b5529b 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -483,8 +483,8 @@ class AugMix(_AutoAugmentBase): augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE orig_dims = list(image_or_video.shape) - expected_dim = 5 if isinstance(orig_image_or_video, features.Video) else 4 - batch = image_or_video.view([1] * max(expected_dim - image_or_video.ndim, 0) + orig_dims) + expected_ndim = 5 if isinstance(orig_image_or_video, features.Video) else 4 + batch = image_or_video.view([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) # Sample the beta weights for combining the original and augmented image or video. 
To get Beta, we use a -- GitLab From 3099e0cc73610ccd39cca7fccbb72fce920f09de Mon Sep 17 00:00:00 2001 From: vsuryamurthy Date: Tue, 11 Oct 2022 09:55:08 +0200 Subject: [PATCH 030/624] Add missing type hints to anchor_utils (#6735) * Use the variable name sizes instead of scales for consistency * Add the missing type hints * Restore the naming back to scales instead of sizes to avoid backwards incompatibility --- torchvision/models/detection/anchor_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index f42c10d82..cdf572a8b 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -61,7 +61,7 @@ class AnchorGenerator(nn.Module): aspect_ratios: List[float], dtype: torch.dtype = torch.float32, device: torch.device = torch.device("cpu"), - ): + ) -> Tensor: scales = torch.as_tensor(scales, dtype=dtype, device=device) aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device) h_ratios = torch.sqrt(aspect_ratios) @@ -76,7 +76,7 @@ class AnchorGenerator(nn.Module): def set_cell_anchors(self, dtype: torch.dtype, device: torch.device): self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device) for cell_anchor in self.cell_anchors] - def num_anchors_per_location(self): + def num_anchors_per_location(self) -> List[int]: return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)] # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2), @@ -201,7 +201,7 @@ class DefaultBoxGenerator(nn.Module): _wh_pairs.append(torch.as_tensor(wh_pairs, dtype=dtype, device=device)) return _wh_pairs - def num_anchors_per_location(self): + def num_anchors_per_location(self) -> List[int]: # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. return [2 + 2 * len(r) for r in self.aspect_ratios] -- GitLab From 4d4711d970f5cbd0a9e1adb465dca2703c8efbfd Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 11 Oct 2022 10:10:48 +0100 Subject: [PATCH 031/624] [prototype] Switch to `spatial_size` (#6736) * Change `image_size` to `spatial_size` * Fix linter * Fixing more tests. * Adding get_num_channels_video and get_spatial_size_* kernels for video, masks and bboxes. * Refactor get_spatial_size * Reduce the usage of `query_chw` where possible * Rename `query_chw` to `query_spatial_size` * Adding `get_num_frames` dispatcher and kernel. 
* Adding jit-scriptability tests --- test/prototype_common_utils.py | 55 ++++---- test/prototype_transforms_kernel_infos.py | 66 +++++----- test/test_prototype_transforms.py | 118 +++++++++--------- test/test_prototype_transforms_consistency.py | 10 +- test/test_prototype_transforms_functional.py | 115 ++++++++--------- test/test_prototype_transforms_utils.py | 4 +- .../prototype/datasets/_builtin/caltech.py | 4 +- .../prototype/datasets/_builtin/celeba.py | 2 +- .../prototype/datasets/_builtin/coco.py | 16 ++- .../prototype/datasets/_builtin/cub200.py | 12 +- .../prototype/datasets/_builtin/gtsrb.py | 2 +- .../datasets/_builtin/stanford_cars.py | 2 +- .../prototype/datasets/_builtin/voc.py | 2 +- .../prototype/features/_bounding_box.py | 54 ++++---- torchvision/prototype/features/_encoded.py | 8 +- torchvision/prototype/features/_image.py | 2 +- torchvision/prototype/features/_mask.py | 2 +- torchvision/prototype/features/_video.py | 3 +- torchvision/prototype/transforms/_augment.py | 4 +- torchvision/prototype/transforms/_color.py | 2 +- .../prototype/transforms/_deprecated.py | 2 +- torchvision/prototype/transforms/_geometry.py | 38 +++--- torchvision/prototype/transforms/_meta.py | 2 +- torchvision/prototype/transforms/_utils.py | 18 ++- .../transforms/functional/__init__.py | 6 + .../transforms/functional/_geometry.py | 56 ++++----- .../prototype/transforms/functional/_meta.py | 48 +++++-- 27 files changed, 354 insertions(+), 299 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 1d5766b1f..220a793ac 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -184,13 +184,18 @@ class ArgsKwargs: return args, kwargs -DEFAULT_SQUARE_IMAGE_SIZE = 15 -DEFAULT_LANDSCAPE_IMAGE_SIZE = (7, 33) -DEFAULT_PORTRAIT_IMAGE_SIZE = (31, 9) -DEFAULT_IMAGE_SIZES = (DEFAULT_LANDSCAPE_IMAGE_SIZE, DEFAULT_PORTRAIT_IMAGE_SIZE, DEFAULT_SQUARE_IMAGE_SIZE, "random") +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, + "random", +) -def _parse_image_size(size, *, name="size"): +def _parse_spatial_size(size, *, name="size"): if size == "random": return tuple(torch.randint(15, 33, (2,)).tolist()) elif isinstance(size, int) and size > 0: @@ -246,11 +251,11 @@ class TensorLoader: @dataclasses.dataclass class ImageLoader(TensorLoader): color_space: features.ColorSpace - image_size: Tuple[int, int] = dataclasses.field(init=False) + spatial_size: Tuple[int, int] = dataclasses.field(init=False) num_channels: int = dataclasses.field(init=False) def __post_init__(self): - self.image_size = self.shape[-2:] + self.spatial_size = self.shape[-2:] self.num_channels = self.shape[-3] @@ -277,7 +282,7 @@ def make_image_loader( dtype=torch.float32, constant_alpha=True, ): - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) def fn(shape, dtype, device): @@ -295,7 +300,7 @@ make_image = from_loader(make_image_loader) def make_image_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, color_spaces=( features.ColorSpace.GRAY, features.ColorSpace.GRAY_ALPHA, @@ -316,7 +321,7 @@ make_images = from_loaders(make_image_loaders) @dataclasses.dataclass class BoundingBoxLoader(TensorLoader): format: features.BoundingBoxFormat - image_size: Tuple[int, int] + spatial_size: Tuple[int, int] def 
randint_with_tensor_bounds(arg1, arg2=None, **kwargs): @@ -331,7 +336,7 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): ).reshape(low.shape) -def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtype=torch.float32): +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32): if isinstance(format, str): format = features.BoundingBoxFormat[format] if format not in { @@ -341,7 +346,7 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp }: raise pytest.UsageError(f"Can't make bounding box in format {format}") - image_size = _parse_image_size(image_size, name="image_size") + spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") def fn(shape, dtype, device): *extra_dims, num_coordinates = shape @@ -350,10 +355,10 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp if any(dim == 0 for dim in extra_dims): return features.BoundingBox( - torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, image_size=image_size + torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - height, width = image_size + height, width = spatial_size if format == features.BoundingBoxFormat.XYXY: x1 = torch.randint(0, width // 2, extra_dims) @@ -375,10 +380,10 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp parts = (cx, cy, w, h) return features.BoundingBox( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, image_size=image_size + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, image_size=image_size) + return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) make_bounding_box = from_loader(make_bounding_box_loader) @@ -388,11 +393,11 @@ def make_bounding_box_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(features.BoundingBoxFormat), - image_size="random", + spatial_size="random", dtypes=(torch.float32, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, image_size=image_size) + yield make_bounding_box_loader(**params, spatial_size=spatial_size) make_bounding_boxes = from_loaders(make_bounding_box_loaders) @@ -475,7 +480,7 @@ class MaskLoader(TensorLoader): def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8): # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects def fn(shape, dtype, device): @@ -489,7 +494,7 @@ make_detection_mask = from_loader(make_detection_mask_loader) def make_detection_mask_loaders( - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), @@ -503,7 +508,7 @@ make_detection_masks = from_loaders(make_detection_mask_loaders) def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): # This produces "segmentation" masks, i.e. 
`(*, H, W)`, where the category is encoded in the values - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories def fn(shape, dtype, device): @@ -518,7 +523,7 @@ make_segmentation_mask = from_loader(make_segmentation_mask_loader) def make_segmentation_mask_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_categories=(1, 2, "random"), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), @@ -532,7 +537,7 @@ make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) def make_mask_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), num_categories=(1, 2, "random"), extra_dims=DEFAULT_EXTRA_DIMS, @@ -559,7 +564,7 @@ def make_video_loader( extra_dims=(), dtype=torch.uint8, ): - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames def fn(shape, dtype, device): @@ -576,7 +581,7 @@ make_video = from_loader(make_video_loader) def make_video_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, color_spaces=( features.ColorSpace.GRAY, features.ColorSpace.RGB, diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c8cca77e0..239425d17 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -145,7 +145,7 @@ def sample_inputs_horizontal_flip_bounding_box(): formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -185,9 +185,9 @@ KERNEL_INFOS.extend( ) -def _get_resize_sizes(image_size): - height, width = image_size - length = max(image_size) +def _get_resize_sizes(spatial_size): + height, width = spatial_size + length = max(spatial_size) yield length yield [length] yield (length,) @@ -201,7 +201,7 @@ def sample_inputs_resize_image_tensor(): for image_loader in make_image_loaders( sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] ): - for size in _get_resize_sizes(image_loader.image_size): + for size in _get_resize_sizes(image_loader.spatial_size): yield ArgsKwargs(image_loader, size=size) for image_loader, interpolation in itertools.product( @@ -212,7 +212,7 @@ def sample_inputs_resize_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) + yield ArgsKwargs(image_loader, size=[min(image_loader.spatial_size) + 1], interpolation=interpolation) yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) @@ -236,7 +236,7 @@ def reference_inputs_resize_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - for size in _get_resize_sizes(image_loader.image_size): + for size in _get_resize_sizes(image_loader.spatial_size): yield ArgsKwargs( image_loader, size=size, @@ -251,8 +251,8 @@ def reference_inputs_resize_image_tensor(): def sample_inputs_resize_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - for size in _get_resize_sizes(bounding_box_loader.image_size): - yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) + for size in 
_get_resize_sizes(bounding_box_loader.spatial_size): + yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size) def sample_inputs_resize_mask(): @@ -394,7 +394,7 @@ def sample_inputs_affine_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, **affine_params, ) @@ -422,9 +422,9 @@ def _compute_affine_matrix(angle, translate, scale, shear, center): return true_matrix -def reference_affine_bounding_box(bounding_box, *, format, image_size, angle, translate, scale, shear, center=None): +def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, translate, scale, shear, center=None): if center is None: - center = [s * 0.5 for s in image_size[::-1]] + center = [s * 0.5 for s in spatial_size[::-1]] def transform(bbox): affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center) @@ -473,7 +473,7 @@ def reference_inputs_affine_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, **affine_kwargs, ) @@ -650,7 +650,7 @@ def sample_inputs_vertical_flip_bounding_box(): formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -729,7 +729,7 @@ def sample_inputs_rotate_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, angle=_ROTATE_ANGLES[0], ) @@ -1001,7 +1001,7 @@ def sample_inputs_pad_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, padding=padding, padding_mode="constant", ) @@ -1131,13 +1131,13 @@ KERNEL_INFOS.extend( ) -def _get_elastic_displacement(image_size): - return torch.rand(1, *image_size, 2) +def _get_elastic_displacement(spatial_size): + return torch.rand(1, *spatial_size, 2) def sample_inputs_elastic_image_tensor(): for image_loader in make_image_loaders(sizes=["random"]): - displacement = _get_elastic_displacement(image_loader.image_size) + displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -1151,14 +1151,14 @@ def reference_inputs_elastic_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - displacement = _get_elastic_displacement(image_loader.image_size) + displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) def sample_inputs_elastic_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_box_loader.image_size) + displacement = _get_elastic_displacement(bounding_box_loader.spatial_size) yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, @@ -1212,7 +1212,7 @@ KERNEL_INFOS.extend( ) 
-_CENTER_CROP_IMAGE_SIZES = [(16, 16), (7, 33), (31, 9)] +_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)] _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] @@ -1231,7 +1231,7 @@ def sample_inputs_center_crop_image_tensor(): def reference_inputs_center_crop_image_tensor(): for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES + make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES ): yield ArgsKwargs(image_loader, output_size=output_size) @@ -1241,7 +1241,7 @@ def sample_inputs_center_crop_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, output_size=output_size, ) @@ -1254,7 +1254,7 @@ def sample_inputs_center_crop_mask(): def reference_inputs_center_crop_mask(): for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES + make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES ): yield ArgsKwargs(mask_loader, output_size=output_size) @@ -1820,7 +1820,7 @@ KERNEL_INFOS.extend( def sample_inputs_clamp_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -1834,7 +1834,7 @@ KERNEL_INFOS.append( _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] -def _get_five_ten_crop_image_size(size): +def _get_five_ten_crop_spatial_size(size): if isinstance(size, int): crop_height = crop_width = size elif len(size) == 1: @@ -1847,28 +1847,32 @@ def _get_five_ten_crop_image_size(size): def sample_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + sizes=[_get_five_ten_crop_spatial_size(size)], + color_spaces=[features.ColorSpace.RGB], + dtypes=[torch.float32], ): yield ArgsKwargs(image_loader, size=size) def reference_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): + for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]): yield ArgsKwargs(image_loader, size=size) def sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + sizes=[_get_five_ten_crop_spatial_size(size)], + color_spaces=[features.ColorSpace.RGB], + dtypes=[torch.float32], ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) def reference_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): + for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]): yield 
ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index d7a41e7c1..2c095fa6e 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -298,7 +298,7 @@ class TestRandomHorizontalFlip: assert_equal(features.Mask(expected), actual) def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) + input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10)) transform = transforms.RandomHorizontalFlip(p=p) actual = transform(input) @@ -307,7 +307,7 @@ class TestRandomHorizontalFlip: expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format - assert actual.image_size == expected.image_size + assert actual.spatial_size == expected.spatial_size @pytest.mark.parametrize("p", [0.0, 1.0]) @@ -351,7 +351,7 @@ class TestRandomVerticalFlip: assert_equal(features.Mask(expected), actual) def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) + input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10)) transform = transforms.RandomVerticalFlip(p=p) actual = transform(input) @@ -360,7 +360,7 @@ class TestRandomVerticalFlip: expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format - assert actual.image_size == expected.image_size + assert actual.spatial_size == expected.spatial_size class TestPad: @@ -435,7 +435,7 @@ class TestRandomZoomOut: transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) image = mocker.MagicMock(spec=features.Image) - h, w = image.image_size = (24, 32) + h, w = image.spatial_size = (24, 32) params = transform._get_params(image) @@ -450,7 +450,7 @@ class TestRandomZoomOut: def test__transform(self, fill, side_range, mocker): inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) @@ -559,17 +559,17 @@ class TestRandomRotation: @pytest.mark.parametrize("angle", [34, -87]) @pytest.mark.parametrize("expand", [False, True]) - def test_boundingbox_image_size(self, angle, expand): + def test_boundingbox_spatial_size(self, angle, expand): # Specific test for BoundingBox.rotate bbox = features.BoundingBox( - torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) + torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, spatial_size=(32, 32) ) img = features.Image(torch.rand(1, 3, 32, 32)) out_img = img.rotate(angle, expand=expand) out_bbox = bbox.rotate(angle, expand=expand) - assert out_img.image_size == out_bbox.image_size + assert out_img.spatial_size == out_bbox.spatial_size class TestRandomAffine: @@ -619,8 +619,8 @@ class TestRandomAffine: def test__get_params(self, degrees, translate, scale, shear, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size + image.spatial_size = (24, 32) + h, w = image.spatial_size transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) params = transform._get_params(image) @@ 
-682,7 +682,7 @@ class TestRandomAffine: fn = mocker.patch("torchvision.prototype.transforms.functional.affine") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -718,8 +718,8 @@ class TestRandomCrop: def test__get_params(self, padding, pad_if_needed, size, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size + image.spatial_size = (24, 32) + h, w = image.spatial_size transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) params = transform._get_params(image) @@ -771,19 +771,19 @@ class TestRandomCrop: inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (32, 32) + inpt.spatial_size = (32, 32) expected = mocker.MagicMock(spec=features.Image) expected.num_channels = 3 if isinstance(padding, int): - expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) + expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding) elif isinstance(padding, list): - expected.image_size = ( - inpt.image_size[0] + sum(padding[0::2]), - inpt.image_size[1] + sum(padding[1::2]), + expected.spatial_size = ( + inpt.spatial_size[0] + sum(padding[0::2]), + inpt.spatial_size[1] + sum(padding[1::2]), ) else: - expected.image_size = inpt.image_size + expected.spatial_size = inpt.spatial_size _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") @@ -859,7 +859,7 @@ class TestGaussianBlur: fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -910,11 +910,11 @@ class TestRandomPerspective: transform = transforms.RandomPerspective(dscale) image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) params = transform._get_params(image) - h, w = image.image_size + h, w = image.spatial_size assert "perspective_coeffs" in params assert len(params["perspective_coeffs"]) == 8 @@ -927,7 +927,7 @@ class TestRandomPerspective: fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users # Otherwise, we can mock transform._get_params @@ -971,11 +971,11 @@ class TestElasticTransform: transform = transforms.ElasticTransform(alpha, sigma) image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) params = transform._get_params(image) - h, w = image.image_size + h, w = image.spatial_size displacement = params["displacement"] assert displacement.shape == (1, h, w, 2) assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() @@ -1001,7 +1001,7 @@ class TestElasticTransform: fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") inpt = 
mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() @@ -1030,7 +1030,7 @@ class TestRandomErasing: image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=[1, 2, 3, 4]) @@ -1041,7 +1041,7 @@ class TestRandomErasing: def test__get_params(self, value, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=value) params = transform._get_params(image) @@ -1057,8 +1057,8 @@ class TestRandomErasing: elif isinstance(value, (list, tuple)): assert v.shape == (image.num_channels, 1, 1) - assert 0 <= i <= image.image_size[0] - h - assert 0 <= j <= image.image_size[1] - w + assert 0 <= i <= image.spatial_size[0] - h + assert 0 <= j <= image.spatial_size[1] - w @pytest.mark.parametrize("p", [0, 1]) def test__transform(self, mocker, p): @@ -1222,11 +1222,11 @@ class TestRandomIoUCrop: def test__get_params(self, device, options, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) bboxes = features.BoundingBox( torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", - image_size=image.image_size, + spatial_size=image.spatial_size, device=device, ) sample = [image, bboxes] @@ -1245,8 +1245,8 @@ class TestRandomIoUCrop: assert len(params["is_within_crop_area"]) > 0 assert params["is_within_crop_area"].dtype == torch.bool - orig_h = image.image_size[0] - orig_w = image.image_size[1] + orig_h = image.spatial_size[0] + orig_w = image.spatial_size[1] assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) @@ -1261,7 +1261,7 @@ class TestRandomIoUCrop: def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) image = features.Image(torch.rand(1, 3, 4, 4)) - bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", image_size=(4, 4)) + bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) label = features.Label(torch.tensor([1])) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -1281,7 +1281,7 @@ class TestRandomIoUCrop: transform = transforms.RandomIoUCrop() image = features.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) + bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,)) label = features.Label(torch.randint(0, 10, size=(6,))) ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) masks = make_detection_mask((32, 24), num_objects=6) @@ -1329,12 +1329,12 @@ class TestRandomIoUCrop: class TestScaleJitter: def test__get_params(self, mocker): - image_size = (24, 32) + spatial_size = (24, 32) target_size = (16, 12) scale_range = (0.5, 1.5) transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) + sample = mocker.MagicMock(spec=features.Image, 
num_channels=3, spatial_size=spatial_size) n_samples = 5 for _ in range(n_samples): @@ -1347,11 +1347,11 @@ class TestScaleJitter: assert isinstance(size, tuple) and len(size) == 2 height, width = size - r_min = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[0] - r_max = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[1] + r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0] + r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1] - assert int(image_size[0] * r_min) <= height <= int(image_size[0] * r_max) - assert int(image_size[1] * r_min) <= width <= int(image_size[1] * r_max) + assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max) + assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max) def test__transform(self, mocker): interpolation_sentinel = mocker.MagicMock() @@ -1379,13 +1379,13 @@ class TestScaleJitter: class TestRandomShortestSize: def test__get_params(self, mocker): - image_size = (3, 10) + spatial_size = (3, 10) min_size = [5, 9] max_size = 20 transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) + sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size) params = transform._get_params(sample) assert "size" in params @@ -1504,7 +1504,7 @@ class TestSimpleCopyPaste: labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { "boxes": features.BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": features.Mask(masks), "labels": label_type(labels), @@ -1519,7 +1519,7 @@ class TestSimpleCopyPaste: paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { "boxes": features.BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": features.Mask(paste_masks), "labels": label_type(paste_labels), @@ -1550,14 +1550,14 @@ class TestFixedSizeCrop: def test__get_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - image_size = (11, 5) + spatial_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) sample = dict( - image=make_image(size=image_size, color_space=features.ColorSpace.RGB), + image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB), bounding_boxes=make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape ), ) params = transform._get_params(sample) @@ -1638,7 +1638,7 @@ class TestFixedSizeCrop: def test__transform_culling(self, mocker): batch_size = 10 - image_size = (10, 10) + spatial_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( @@ -1647,17 +1647,17 @@ class TestFixedSizeCrop: needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=spatial_size[0], + width=spatial_size[1], is_valid=is_valid, needs_pad=False, ), ) bounding_boxes = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, 
image_size=image_size, extra_dims=(batch_size,) + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) - masks = make_detection_mask(size=image_size, extra_dims=(batch_size,)) + masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -1678,7 +1678,7 @@ class TestFixedSizeCrop: def test__transform_bounding_box_clamping(self, mocker): batch_size = 3 - image_size = (10, 10) + spatial_size = (10, 10) mocker.patch( "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", @@ -1686,15 +1686,15 @@ class TestFixedSizeCrop: needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=spatial_size[0], + width=spatial_size[1], is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) bounding_box = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index c8debe1e2..f335220fb 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -24,7 +24,7 @@ from torchvision import transforms as legacy_transforms from torchvision._utils import sequence_to_str from torchvision.prototype import features, transforms as prototype_transforms from torchvision.prototype.transforms import functional as F -from torchvision.prototype.transforms._utils import query_chw +from torchvision.prototype.transforms._utils import query_spatial_size from torchvision.prototype.transforms.functional import to_image_pil DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) @@ -871,7 +871,7 @@ class TestRefDetTransforms: pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -881,7 +881,7 @@ class TestRefDetTransforms: tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -891,7 +891,7 @@ class TestRefDetTransforms: feature_image = make_image(size=size, color_space=features.ColorSpace.RGB) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -949,7 +949,7 @@ class PadIfSmaller(prototype_transforms.Transform): self.fill = prototype_transforms._geometry._setup_fill_arg(fill) def _get_params(self, sample): - _, height, width = query_chw(sample) + height, width = 
query_spatial_size(sample) padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] needs_padding = any(padding) return dict(padding=padding, needs_padding=needs_padding) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 8329de697..56c473a23 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -224,11 +224,14 @@ class TestDispatchers: @pytest.mark.parametrize( "dispatcher", [ + F.clamp_bounding_box, F.convert_color_space, F.convert_image_dtype, F.get_dimensions, F.get_image_num_channels, F.get_image_size, + F.get_num_channels, + F.get_num_frames, F.get_spatial_size, F.rgb_to_grayscale, ], @@ -333,16 +336,16 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): @pytest.mark.parametrize("device", cpu_and_gpu()) def test_correctness_affine_bounding_box_on_fixed_input(device): # Check transformation against known expected output - image_size = (64, 64) + spatial_size = (64, 64) # xyxy format in_boxes = [ [20, 25, 35, 45], [50, 5, 70, 22], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], + [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], [1, 1, 5, 5], ] in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device ) # Tested parameters angle = 63 @@ -355,9 +358,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox # expected_bboxes = [] # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *image_size) - # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) - # out_box = denormalize_bbox(n_out_box, *image_size) + # n_in_box = normalize_bbox(in_box, *spatial_size) + # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *spatial_size) + # out_box = denormalize_bbox(n_out_box, *spatial_size) # expected_bboxes.append(out_box) expected_bboxes = [ (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), @@ -369,9 +372,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): output_boxes = F.affine_bounding_box( in_boxes, in_boxes.format, - in_boxes.image_size, + in_boxes.spatial_size, angle, - (dx * image_size[1], dy * image_size[0]), + (dx * spatial_size[1], dy * spatial_size[0]), scale, shear=(0, 0), ) @@ -406,7 +409,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center): affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) affine_matrix = affine_matrix[:2, :] - height, width = bbox.image_size + height, width = bbox.spatial_size bbox_xyxy = convert_format_bounding_box( bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY ) @@ -444,7 +447,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center): out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYXY, - image_size=(height, width), + spatial_size=(height, width), dtype=bbox.dtype, device=bbox.device, ) @@ -455,16 +458,16 @@ def test_correctness_rotate_bounding_box(angle, expand, center): (height, width), ) - image_size = (32, 38) + spatial_size = (32, 38) - for bboxes in 
make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_bboxes, output_image_size = F.rotate_bounding_box( + output_bboxes, output_spatial_size = F.rotate_bounding_box( bboxes, bboxes_format, - image_size=bboxes_image_size, + spatial_size=bboxes_spatial_size, angle=angle, expand=expand, center=center, @@ -472,38 +475,38 @@ def test_correctness_rotate_bounding_box(angle, expand, center): center_ = center if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] + center_ = [s * 0.5 for s in bboxes_spatial_size[::-1]] if bboxes.ndim < 2: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + expected_bbox, expected_spatial_size = _compute_expected_bbox(bbox, -angle, expand, center_) expected_bboxes.append(expected_bbox) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) else: expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0) + torch.testing.assert_close(output_spatial_size, expected_spatial_size, atol=1, rtol=0) @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): # Check transformation against known expected output - image_size = (64, 64) + spatial_size = (64, 64) # xyxy format in_boxes = [ [1, 1, 5, 5], - [1, image_size[0] - 6, 5, image_size[0] - 2], - [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], + [1, spatial_size[0] - 6, 5, spatial_size[0] - 2], + [spatial_size[1] - 6, spatial_size[0] - 6, spatial_size[1] - 2, spatial_size[0] - 2], + [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], ] in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device ) # Tested parameters angle = 45 @@ -535,7 +538,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): output_boxes, _ = F.rotate_bounding_box( in_boxes, in_boxes.format, - in_boxes.image_size, + in_boxes.spatial_size, angle, expand=expand, center=center, @@ -593,11 +596,11 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, [50.0, 5.0, 70.0, 22.0], [45.0, 46.0, 56.0, 62.0], ] - in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) + in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=size, device=device) if format != features.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - output_boxes, output_image_size = F.crop_bounding_box( 
+ output_boxes, output_spatial_size = F.crop_bounding_box( in_boxes, format, top, @@ -610,7 +613,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - torch.testing.assert_close(output_image_size, size) + torch.testing.assert_close(output_spatial_size, size) @pytest.mark.parametrize("device", cpu_and_gpu()) @@ -658,7 +661,7 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height bbox[3] = (bbox[3] - top_) * size_[0] / height_ return bbox - image_size = (100, 100) + spatial_size = (100, 100) # xyxy format in_boxes = [ [10.0, 10.0, 20.0, 20.0], @@ -670,18 +673,18 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height expected_bboxes = torch.tensor(expected_bboxes, device=device) in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device ) if format != features.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) + output_boxes, output_spatial_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) if format != features.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, size) + torch.testing.assert_close(output_spatial_size, size) def _parse_padding(padding): @@ -718,28 +721,28 @@ def test_correctness_pad_bounding_box(device, padding): bbox = bbox.to(bbox_dtype) return bbox - def _compute_expected_image_size(bbox, padding_): + def _compute_expected_spatial_size(bbox, padding_): pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - height, width = bbox.image_size + height, width = bbox.spatial_size return height + pad_up + pad_down, width + pad_left + pad_right for bboxes in make_bounding_boxes(): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_image_size = F.pad_bounding_box( - bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding + output_boxes, output_spatial_size = F.pad_bounding_box( + bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding ) - torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding)) + torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding)) if bboxes.ndim < 2 or bboxes.shape[0] == 0: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, padding)) if len(expected_bboxes) > 1: @@ -807,7 +810,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): out_bbox = features.BoundingBox( np.array(out_bbox), format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, + 
spatial_size=bbox.spatial_size, dtype=bbox.dtype, device=bbox.device, ) @@ -815,15 +818,15 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False ) - image_size = (32, 38) + spatial_size = (32, 38) pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size output_bboxes = F.perspective_bounding_box( bboxes, @@ -836,7 +839,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) @@ -853,14 +856,14 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): def test_correctness_center_crop_bounding_box(device, output_size): def _compute_expected_bbox(bbox, output_size_): format_ = bbox.format - image_size_ = bbox.image_size + spatial_size_ = bbox.spatial_size bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) - cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) + cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5)) out_bbox = [ bbox[0].item() - cx, bbox[1].item() - cy, @@ -870,7 +873,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYWH, - image_size=output_size_, + spatial_size=output_size_, dtype=bbox.dtype, device=bbox.device, ) @@ -879,10 +882,10 @@ def test_correctness_center_crop_bounding_box(device, output_size): for bboxes in make_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_image_size = F.center_crop_bounding_box( - bboxes, bboxes_format, bboxes_image_size, output_size + output_boxes, output_spatial_size = F.center_crop_bounding_box( + bboxes, bboxes_format, bboxes_spatial_size, output_size ) if bboxes.ndim < 2: @@ -890,7 +893,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) if len(expected_bboxes) > 1: @@ -898,7 +901,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): else: expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, output_size) + torch.testing.assert_close(output_spatial_size, output_size) 
@pytest.mark.parametrize("device", cpu_and_gpu()) @@ -926,11 +929,11 @@ def test_correctness_center_crop_mask(device, output_size): # Copied from test/test_functional_tensor.py @pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("image_size", ("small", "large")) +@pytest.mark.parametrize("spatial_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): +def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma): fn = F.gaussian_blur_image_tensor # true_cv2_results = { @@ -950,7 +953,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") true_cv2_results = torch.load(p) - if image_size == "small": + if spatial_size == "small": tensor = ( torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) ) diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py index 9a8ed67dd..3d5960c96 100644 --- a/test/test_prototype_transforms_utils.py +++ b/test/test_prototype_transforms_utils.py @@ -11,8 +11,8 @@ from torchvision.prototype.transforms.functional import to_image_pil IMAGE = make_image(color_space=features.ColorSpace.RGB) -BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) -MASK = make_detection_mask(size=IMAGE.image_size) +BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) +MASK = make_detection_mask(size=IMAGE.spatial_size) @pytest.mark.parametrize( diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index a00bf2e2c..29ed162cc 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -110,7 +110,9 @@ class Caltech101(Dataset): image=image, ann_path=ann_path, bounding_box=BoundingBox( - ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], format="xyxy", image_size=image.image_size + ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], + format="xyxy", + spatial_size=image.spatial_size, ), contour=_Feature(ann["obj_contour"].T), ) diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index a0a021845..3382b62b6 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -144,7 +144,7 @@ class CelebA(Dataset): bounding_box=BoundingBox( [int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")], format="xywh", - image_size=image.image_size, + spatial_size=image.spatial_size, ), landmarks={ landmark: _Feature((int(landmarks[f"{landmark}_x"]), int(landmarks[f"{landmark}_y"]))) diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index 16a16998b..72d76f487 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -97,25 +97,29 @@ class Coco(Dataset): ) return [images, meta] - def _segmentation_to_mask(self, segmentation: Any, *, is_crowd: bool, image_size: 
Tuple[int, int]) -> torch.Tensor: + def _segmentation_to_mask( + self, segmentation: Any, *, is_crowd: bool, spatial_size: Tuple[int, int] + ) -> torch.Tensor: from pycocotools import mask if is_crowd: - segmentation = mask.frPyObjects(segmentation, *image_size) + segmentation = mask.frPyObjects(segmentation, *spatial_size) else: - segmentation = mask.merge(mask.frPyObjects(segmentation, *image_size)) + segmentation = mask.merge(mask.frPyObjects(segmentation, *spatial_size)) return torch.from_numpy(mask.decode(segmentation)).to(torch.bool) def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]: - image_size = (image_meta["height"], image_meta["width"]) + spatial_size = (image_meta["height"], image_meta["width"]) labels = [ann["category_id"] for ann in anns] return dict( # TODO: create a segmentation feature segmentations=_Feature( torch.stack( [ - self._segmentation_to_mask(ann["segmentation"], is_crowd=ann["iscrowd"], image_size=image_size) + self._segmentation_to_mask( + ann["segmentation"], is_crowd=ann["iscrowd"], spatial_size=spatial_size + ) for ann in anns ] ) @@ -125,7 +129,7 @@ class Coco(Dataset): bounding_boxes=BoundingBox( [ann["bbox"] for ann in anns], format="xywh", - image_size=image_size, + spatial_size=spatial_size, ), labels=Label(labels, categories=self._categories), super_categories=[self._category_to_super_category[self._categories[label]] for label in labels], diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index f1531615c..9c32d96f9 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -130,13 +130,13 @@ class CUB200(Dataset): return path.with_suffix(".jpg").name def _2011_prepare_ann( - self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], image_size: Tuple[int, int] + self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], spatial_size: Tuple[int, int] ) -> Dict[str, Any]: _, (bounding_box_data, segmentation_data) = data segmentation_path, segmentation_buffer = segmentation_data return dict( bounding_box=BoundingBox( - [float(part) for part in bounding_box_data[1:]], format="xywh", image_size=image_size + [float(part) for part in bounding_box_data[1:]], format="xywh", spatial_size=spatial_size ), segmentation_path=segmentation_path, segmentation=EncodedImage.from_file(segmentation_buffer), @@ -149,7 +149,9 @@ class CUB200(Dataset): path = pathlib.Path(data[0]) return path.with_suffix(".jpg").name, data - def _2010_prepare_ann(self, data: Tuple[str, Tuple[str, BinaryIO]], image_size: Tuple[int, int]) -> Dict[str, Any]: + def _2010_prepare_ann( + self, data: Tuple[str, Tuple[str, BinaryIO]], spatial_size: Tuple[int, int] + ) -> Dict[str, Any]: _, (path, buffer) = data content = read_mat(buffer) return dict( @@ -157,7 +159,7 @@ class CUB200(Dataset): bounding_box=BoundingBox( [int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")], format="xyxy", - image_size=image_size, + spatial_size=spatial_size, ), segmentation=_Feature(content["seg"]), ) @@ -175,7 +177,7 @@ class CUB200(Dataset): image = EncodedImage.from_file(buffer) return dict( - prepare_ann_fn(anns_data, image.image_size), + prepare_ann_fn(anns_data, image.spatial_size), image=image, label=Label( int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1, diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 
8dc0a8240..e11dc2bb4 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -78,7 +78,7 @@ class GTSRB(Dataset): bounding_box = BoundingBox( [int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")], format="xyxy", - image_size=(int(csv_info["Height"]), int(csv_info["Width"])), + spatial_size=(int(csv_info["Height"]), int(csv_info["Width"])), ) return { diff --git a/torchvision/prototype/datasets/_builtin/stanford_cars.py b/torchvision/prototype/datasets/_builtin/stanford_cars.py index 011204f2b..a0e7a377e 100644 --- a/torchvision/prototype/datasets/_builtin/stanford_cars.py +++ b/torchvision/prototype/datasets/_builtin/stanford_cars.py @@ -89,7 +89,7 @@ class StanfordCars(Dataset): path=path, image=image, label=Label(target[4] - 1, categories=self._categories), - bounding_box=BoundingBox(target[:4], format="xyxy", image_size=image.image_size), + bounding_box=BoundingBox(target[:4], format="xyxy", spatial_size=image.spatial_size), ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: diff --git a/torchvision/prototype/datasets/_builtin/voc.py b/torchvision/prototype/datasets/_builtin/voc.py index 84a9b3a7f..8db82b4aa 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -108,7 +108,7 @@ class VOC(Dataset): for instance in instances ], format="xyxy", - image_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), + spatial_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), ), labels=Label( [self._categories.index(instance["name"]) for instance in instances], categories=self._categories diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py index 7b69af5f9..18c607d4d 100644 --- a/torchvision/prototype/features/_bounding_box.py +++ b/torchvision/prototype/features/_bounding_box.py @@ -17,13 +17,13 @@ class BoundingBoxFormat(StrEnum): class BoundingBox(_Feature): format: BoundingBoxFormat - image_size: Tuple[int, int] + spatial_size: Tuple[int, int] @classmethod - def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, image_size: Tuple[int, int]) -> BoundingBox: + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBox: bounding_box = tensor.as_subclass(cls) bounding_box.format = format - bounding_box.image_size = image_size + bounding_box.spatial_size = spatial_size return bounding_box def __new__( @@ -31,7 +31,7 @@ class BoundingBox(_Feature): data: Any, *, format: Union[BoundingBoxFormat, str], - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, @@ -41,7 +41,7 @@ class BoundingBox(_Feature): if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - return cls._wrap(tensor, format=format, image_size=image_size) + return cls._wrap(tensor, format=format, spatial_size=spatial_size) @classmethod def wrap_like( @@ -50,16 +50,16 @@ class BoundingBox(_Feature): tensor: torch.Tensor, *, format: Optional[BoundingBoxFormat] = None, - image_size: Optional[Tuple[int, int]] = None, + spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: return cls._wrap( tensor, format=format if format is not None else other.format, - image_size=image_size if image_size is not None else 
other.image_size, + spatial_size=spatial_size if spatial_size is not None else other.spatial_size, ) def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, image_size=self.image_size) + return self._make_repr(format=self.format, spatial_size=self.spatial_size) def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: if isinstance(format, str): @@ -70,11 +70,11 @@ class BoundingBox(_Feature): ) def horizontal_flip(self) -> BoundingBox: - output = self._F.horizontal_flip_bounding_box(self, format=self.format, image_size=self.image_size) + output = self._F.horizontal_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size) return BoundingBox.wrap_like(self, output) def vertical_flip(self) -> BoundingBox: - output = self._F.vertical_flip_bounding_box(self, format=self.format, image_size=self.image_size) + output = self._F.vertical_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size) return BoundingBox.wrap_like(self, output) def resize( # type: ignore[override] @@ -84,20 +84,22 @@ class BoundingBox(_Feature): max_size: Optional[int] = None, antialias: bool = False, ) -> BoundingBox: - output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size) - return BoundingBox.wrap_like(self, output, image_size=image_size) + output, spatial_size = self._F.resize_bounding_box( + self, spatial_size=self.spatial_size, size=size, max_size=max_size + ) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: - output, image_size = self._F.crop_bounding_box( + output, spatial_size = self._F.crop_bounding_box( self, self.format, top=top, left=left, height=height, width=width ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def center_crop(self, output_size: List[int]) -> BoundingBox: - output, image_size = self._F.center_crop_bounding_box( - self, format=self.format, image_size=self.image_size, output_size=output_size + output, spatial_size = self._F.center_crop_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, output_size=output_size ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def resized_crop( self, @@ -109,8 +111,8 @@ class BoundingBox(_Feature): interpolation: InterpolationMode = InterpolationMode.BILINEAR, antialias: bool = False, ) -> BoundingBox: - output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) - return BoundingBox.wrap_like(self, output, image_size=image_size) + output, spatial_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def pad( self, @@ -118,10 +120,10 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, padding_mode: str = "constant", ) -> BoundingBox: - output, image_size = self._F.pad_bounding_box( - self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode + output, spatial_size = self._F.pad_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, padding=padding, padding_mode=padding_mode ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return 
BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def rotate( self, @@ -131,10 +133,10 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, center: Optional[List[float]] = None, ) -> BoundingBox: - output, image_size = self._F.rotate_bounding_box( - self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center + output, spatial_size = self._F.rotate_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, angle=angle, expand=expand, center=center ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def affine( self, @@ -149,7 +151,7 @@ class BoundingBox(_Feature): output = self._F.affine_bounding_box( self, self.format, - self.image_size, + self.spatial_size, angle, translate=translate, scale=scale, diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 4b963986b..9347b4eca 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -49,12 +49,12 @@ class EncodedData(_Feature): class EncodedImage(EncodedData): # TODO: Use @functools.cached_property if we can depend on Python 3.8 @property - def image_size(self) -> Tuple[int, int]: - if not hasattr(self, "_image_size"): + def spatial_size(self) -> Tuple[int, int]: + if not hasattr(self, "_spatial_size"): with PIL.Image.open(ReadOnlyTensorBuffer(self)) as image: - self._image_size = image.height, image.width + self._spatial_size = image.height, image.width - return self._image_size + return self._spatial_size class EncodedVideo(EncodedData): diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 23f81678d..6d52a178b 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -105,7 +105,7 @@ class Image(_Feature): return self._make_repr(color_space=self.color_space) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @property diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 7b49ce8e8..2da10195e 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -33,7 +33,7 @@ class Mask(_Feature): return cls._wrap(tensor) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) def horizontal_flip(self) -> Mask: diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index e32c36d5d..ca4253c73 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -54,9 +54,8 @@ class Video(_Feature): def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr(color_space=self.color_space) - # TODO: rename this (and all instances of this term to spatial size) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @property diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 4bfb5c9ed..f0e527385 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -11,7 +11,7 @@ from torchvision.prototype import 
features from torchvision.prototype.transforms import functional as F, InterpolationMode from ._transform import _RandomApplyTransform -from ._utils import has_any, query_chw +from ._utils import has_any, query_chw, query_spatial_size class RandomErasing(_RandomApplyTransform): @@ -153,7 +153,7 @@ class RandomCutmix(_BaseMixupCutmix): def _get_params(self, sample: Any) -> Dict[str, Any]: lam = float(self._dist.sample(())) - _, H, W = query_chw(sample) + H, W = query_spatial_size(sample) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 340e721da..616669cc8 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -100,7 +100,7 @@ class RandomPhotometricDistort(Transform): self.p = p def _get_params(self, sample: Any) -> Dict[str, Any]: - num_channels, _, _ = query_chw(sample) + num_channels, *_ = query_chw(sample) return dict( zip( ["brightness", "contrast1", "saturation", "hue", "contrast2"], diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index f8aec22b9..0cc4a90c4 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -78,7 +78,7 @@ class RandomGrayscale(_RandomApplyTransform): super().__init__(p=p) def _get_params(self, sample: Any) -> Dict[str, Any]: - num_input_channels, _, _ = query_chw(sample) + num_input_channels, *_ = query_chw(sample) return dict(num_input_channels=num_input_channels) def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 371ea7f69..91d7c294e 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -24,7 +24,7 @@ from ._utils import ( has_all, has_any, query_bounding_box, - query_chw, + query_spatial_size, ) @@ -105,10 +105,7 @@ class RandomResizedCrop(Transform): self._log_ratio = torch.log(torch.tensor(self.ratio)) def _get_params(self, sample: Any) -> Dict[str, Any]: - # vfdev-5: techically, this op can work on bboxes/segm masks only inputs without image in samples - # What if we have multiple images/bboxes/masks of different sizes ? 
- # TODO: let's support bbox or mask in samples without image - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) area = height * width log_ratio = self._log_ratio @@ -263,7 +260,7 @@ class RandomZoomOut(_RandomApplyTransform): raise ValueError(f"Invalid canvas side range provided {side_range}.") def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_h, orig_w = query_chw(sample) + orig_h, orig_w = query_spatial_size(sample) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -362,10 +359,7 @@ class RandomAffine(Transform): self.center = center def _get_params(self, sample: Any) -> Dict[str, Any]: - - # Get image size - # TODO: make it work with bboxes and segm masks - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) if self.translate is not None: @@ -427,7 +421,7 @@ class RandomCrop(Transform): self.padding_mode = padding_mode def _get_params(self, sample: Any) -> Dict[str, Any]: - _, padded_height, padded_width = query_chw(sample) + padded_height, padded_width = query_spatial_size(sample) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -515,9 +509,7 @@ class RandomPerspective(_RandomApplyTransform): self.fill = _setup_fill_arg(fill) def _get_params(self, sample: Any) -> Dict[str, Any]: - # Get image size - # TODO: make it work with bboxes and segm masks - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) distortion_scale = self.distortion_scale @@ -571,9 +563,7 @@ class ElasticTransform(Transform): self.fill = _setup_fill_arg(fill) def _get_params(self, sample: Any) -> Dict[str, Any]: - # Get image size - # TODO: make it work with bboxes and segm masks - _, *size = query_chw(sample) + size = list(query_spatial_size(sample)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -628,7 +618,7 @@ class RandomIoUCrop(Transform): self.trials = trials def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_h, orig_w = query_chw(sample) + orig_h, orig_w = query_spatial_size(sample) bboxes = query_bounding_box(sample) while True: @@ -690,7 +680,7 @@ class RandomIoUCrop(Transform): if isinstance(output, features.BoundingBox): bboxes = output[is_within_crop_area] - bboxes = F.clamp_bounding_box(bboxes, output.format, output.image_size) + bboxes = F.clamp_bounding_box(bboxes, output.format, output.spatial_size) output = features.BoundingBox.wrap_like(output, bboxes) elif isinstance(output, features.Mask): # apply is_within_crop_area if mask is one-hot encoded @@ -727,7 +717,7 @@ class ScaleJitter(Transform): self.antialias = antialias def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_height, orig_width = query_chw(sample) + orig_height, orig_width = query_spatial_size(sample) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -755,7 +745,7 @@ class RandomShortestSize(Transform): self.antialias = antialias def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_height, orig_width = query_chw(sample) + orig_height, orig_width = query_spatial_size(sample) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) @@ -786,7 +776,7 @@ 
class FixedSizeCrop(Transform): self.padding_mode = padding_mode def _get_params(self, sample: Any) -> Dict[str, Any]: - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -811,7 +801,7 @@ class FixedSizeCrop(Transform): bounding_boxes = features.BoundingBox.wrap_like( bounding_boxes, F.clamp_bounding_box( - bounding_boxes, format=bounding_boxes.format, image_size=bounding_boxes.image_size + bounding_boxes, format=bounding_boxes.format, spatial_size=bounding_boxes.spatial_size ), ) height_and_width = bounding_boxes.to_format(features.BoundingBoxFormat.XYWH)[..., 2:] @@ -851,7 +841,7 @@ class FixedSizeCrop(Transform): elif isinstance(inpt, features.BoundingBox): inpt = features.BoundingBox.wrap_like( inpt, - F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, image_size=inpt.image_size), + F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size), ) if params["needs_pad"]: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index e5c7d05b0..dc109269f 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -68,5 +68,5 @@ class ClampBoundingBoxes(Transform): _transformed_types = (features.BoundingBox,) def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: - output = F.clamp_bounding_box(inpt, format=inpt.format, image_size=inpt.image_size) + output = F.clamp_bounding_box(inpt, format=inpt.format, spatial_size=inpt.spatial_size) return features.BoundingBox.wrap_like(inpt, output) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index a3980fa21..53b27f2e2 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -10,7 +10,7 @@ from torchvision._utils import sequence_to_str from torchvision.prototype import features from torchvision.prototype.features._feature import FillType -from torchvision.prototype.transforms.functional._meta import get_dimensions +from torchvision.prototype.transforms.functional._meta import get_dimensions, get_spatial_size from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size # noqa: F401 from typing_extensions import Literal @@ -98,6 +98,22 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: return c, h, w +def query_spatial_size(sample: Any) -> Tuple[int, int]: + flat_sample, _ = tree_flatten(sample) + sizes = { + tuple(get_spatial_size(item)) + for item in flat_sample + if isinstance(item, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) + or features.is_simple_tensor(item) + } + if not sizes: + raise TypeError("No image, video, mask or bounding box was found in the sample") + elif len(sizes) > 1: + raise ValueError(f"Found multiple HxW dimensions in the sample: {sequence_to_str(sorted(sizes))}") + h, w = sizes.pop() + return h, w + + def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...]) -> bool: for type_or_check in types_or_checks: if isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj): diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index 579442dc7..fb72e7b57 100644 --- 
a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -11,12 +11,18 @@ from ._meta import ( get_dimensions_image_tensor, get_dimensions_image_pil, get_dimensions, + get_num_frames_video, + get_num_frames, get_image_num_channels, get_num_channels_image_tensor, get_num_channels_image_pil, + get_num_channels_video, get_num_channels, + get_spatial_size_bounding_box, get_spatial_size_image_tensor, get_spatial_size_image_pil, + get_spatial_size_mask, + get_spatial_size_video, get_spatial_size, ) # usort: skip diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 44b4986ab..590a13310 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -32,7 +32,7 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: def horizontal_flip_bounding_box( - bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: features.BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape @@ -40,7 +40,7 @@ def horizontal_flip_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - bounding_box[:, [0, 2]] = image_size[1] - bounding_box[:, [2, 0]] + bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False @@ -69,7 +69,7 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: def vertical_flip_bounding_box( - bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: features.BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape @@ -77,7 +77,7 @@ def vertical_flip_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - bounding_box[:, [1, 3]] = image_size[0] - bounding_box[:, [3, 1]] + bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False @@ -104,11 +104,11 @@ vflip = vertical_flip def _compute_resized_output_size( - image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> List[int]: if isinstance(size, int): size = [size] - return __compute_resized_output_size(image_size, size=size, max_size=max_size) + return __compute_resized_output_size(spatial_size, size=size, max_size=max_size) def resize_image_tensor( @@ -162,10 +162,10 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N def resize_bounding_box( - bounding_box: torch.Tensor, image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + bounding_box: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> Tuple[torch.Tensor, Tuple[int, int]]: - old_height, old_width = image_size - new_height, new_width = _compute_resized_output_size(image_size, size=size, max_size=max_size) + old_height, old_width = spatial_size + new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) ratios = torch.tensor((new_width / 
old_width, new_height / old_height), device=bounding_box.device) return ( bounding_box.view(-1, 2, 2).mul(ratios).to(bounding_box.dtype).view(bounding_box.shape), @@ -312,7 +312,7 @@ def affine_image_pil( def _affine_bounding_box_xyxy( bounding_box: torch.Tensor, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -325,7 +325,7 @@ def _affine_bounding_box_xyxy( ) if center is None: - height, width = image_size + height, width = spatial_size center = [width * 0.5, height * 0.5] dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 @@ -359,7 +359,7 @@ def _affine_bounding_box_xyxy( if expand: # Compute minimum point for transformed image frame: # Points are Top-Left, Top-Right, Bottom-Left, Bottom-Right points. - height, width = image_size + height, width = spatial_size points = torch.tensor( [ [0.0, 0.0, 1.0], @@ -378,15 +378,15 @@ def _affine_bounding_box_xyxy( # Estimate meta-data for image with inverted=True and with center=[0,0] affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height) - image_size = (new_height, new_width) + spatial_size = (new_height, new_width) - return out_bboxes.to(bounding_box.dtype), image_size + return out_bboxes.to(bounding_box.dtype), spatial_size def affine_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -398,7 +398,7 @@ def affine_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) + out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, spatial_size, angle, translate, scale, shear, center) # out_bboxes should be of shape [N boxes, 4] @@ -573,7 +573,7 @@ def rotate_image_pil( def rotate_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: float, expand: bool = False, center: Optional[List[float]] = None, @@ -587,9 +587,9 @@ def rotate_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes, image_size = _affine_bounding_box_xyxy( + out_bboxes, spatial_size = _affine_bounding_box_xyxy( bounding_box, - image_size, + spatial_size, angle=-angle, translate=[0.0, 0.0], scale=1.0, @@ -602,7 +602,7 @@ def rotate_bounding_box( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False ).view(original_shape), - image_size, + spatial_size, ) @@ -756,7 +756,7 @@ def pad_mask( def pad_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], padding: Union[int, List[int]], padding_mode: str = "constant", ) -> Tuple[torch.Tensor, Tuple[int, int]]: @@ -775,7 +775,7 @@ def pad_bounding_box( bounding_box[..., 2] += left bounding_box[..., 3] += top - height, width = image_size + height, width = spatial_size height += top + bottom width += left + right @@ -1066,10 +1066,10 @@ def elastic_bounding_box( ).view(-1, 4) # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it - # Or add image_size arg and check 
displacement shape - image_size = displacement.shape[-3], displacement.shape[-2] + # Or add spatial_size arg and check displacement shape + spatial_size = displacement.shape[-3], displacement.shape[-2] - id_grid = _FT._create_identity_grid(list(image_size)).to(bounding_box.device) + id_grid = _FT._create_identity_grid(list(spatial_size)).to(bounding_box.device) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement # This is not an exact inverse of the grid inv_grid = id_grid - displacement @@ -1079,7 +1079,7 @@ def elastic_bounding_box( index_x = torch.floor(points[:, 0] + 0.5).to(dtype=torch.long) index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long) # Transform points: - t_size = torch.tensor(image_size[::-1], device=displacement.device, dtype=displacement.dtype) + t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5 transformed_points = transformed_points.view(-1, 4, 2) @@ -1199,11 +1199,11 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL def center_crop_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], output_size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: crop_height, crop_width = _center_crop_parse_output_size(output_size) - crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *image_size) + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size) return crop_bounding_box(bounding_box, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width) diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index c03d65c95..a118784eb 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -18,7 +18,7 @@ def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: return get_dimensions_image_tensor(image) elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels - height, width = image.image_size + height, width = image.spatial_size return [channels, height, width] else: return get_dimensions_image_pil(image) @@ -28,6 +28,10 @@ get_num_channels_image_tensor = _FT.get_image_num_channels get_num_channels_image_pil = _FP.get_image_num_channels +def get_num_channels_video(video: torch.Tensor) -> int: + return get_num_channels_image_tensor(video) + + def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) @@ -55,21 +59,39 @@ def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: return [height, width] -# TODO: Should we have get_spatial_size_video here? How about masks/bbox etc? What is the criterion for deciding when -# a kernel will be created? 
+def get_spatial_size_video(video: torch.Tensor) -> List[int]: + return get_spatial_size_image_tensor(video) + + +def get_spatial_size_mask(mask: torch.Tensor) -> List[int]: + return get_spatial_size_image_tensor(mask) + + +@torch.jit.unused +def get_spatial_size_bounding_box(bounding_box: features.BoundingBox) -> List[int]: + return list(bounding_box.spatial_size) def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return get_spatial_size_image_tensor(inpt) - elif isinstance(inpt, features._Feature): - image_size = getattr(inpt, "image_size", None) - if image_size is not None: - return list(image_size) - else: - raise ValueError(f"Type {inpt.__class__} doesn't have spatial size.") + elif isinstance(inpt, (features.Image, features.Video, features.BoundingBox, features.Mask)): + return list(inpt.spatial_size) + else: + return get_spatial_size_image_pil(inpt) # type: ignore[no-any-return] + + +def get_num_frames_video(video: torch.Tensor) -> int: + return video.shape[-4] + + +def get_num_frames(inpt: features.VideoTypeJIT) -> int: + if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Video)): + return get_num_frames_video(inpt) + elif isinstance(inpt, features.Video): + return inpt.num_frames else: - return get_spatial_size_image_pil(inpt) + raise TypeError(f"The video should be a Tensor. Got {type(inpt)}") def _xywh_to_xyxy(xywh: torch.Tensor) -> torch.Tensor: @@ -125,13 +147,13 @@ def convert_format_bounding_box( def clamp_bounding_box( - bounding_box: torch.Tensor, format: BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: # TODO: (PERF) Possible speed up clamping if we have different implementations for each bbox format. # Not sure if they yield equivalent results. 
xyxy_boxes = convert_format_bounding_box(bounding_box, format, BoundingBoxFormat.XYXY) - xyxy_boxes[..., 0::2].clamp_(min=0, max=image_size[1]) - xyxy_boxes[..., 1::2].clamp_(min=0, max=image_size[0]) + xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1]) + xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0]) return convert_format_bounding_box(xyxy_boxes, BoundingBoxFormat.XYXY, format, copy=False) -- GitLab From 6e72f2fda1df6704003742238f0e87732b9635a1 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 11 Oct 2022 16:59:58 +0100 Subject: [PATCH 032/624] Add seeds on Kernel Info and reduce randomness for Gaussian Blur (#6741) * Add seeds on Kernel Info and reduce randomness for Gaussian Blur * Fix linter --- test/prototype_transforms_kernel_infos.py | 9 +++++++-- test/test_prototype_transforms_functional.py | 14 +++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 239425d17..f7b1e71f3 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -49,12 +49,14 @@ class KernelInfo(InfoBase): test_marks=None, # See InfoBase closeness_kwargs=None, + seed=None, ): super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) self.kernel = kernel self.sample_inputs_fn = sample_inputs_fn self.reference_fn = reference_fn self.reference_inputs_fn = reference_inputs_fn + self.seed = seed DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -1304,7 +1306,7 @@ KERNEL_INFOS.extend( def sample_inputs_gaussian_blur_image_tensor(): make_gaussian_blur_image_loaders = functools.partial( - make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB] + make_image_loaders, sizes=[(7, 33)], color_spaces=[features.ColorSpace.RGB] ) for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): @@ -1317,7 +1319,7 @@ def sample_inputs_gaussian_blur_image_tensor(): def sample_inputs_gaussian_blur_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[(7, 33)], num_frames=[5]): yield ArgsKwargs(video_loader, kernel_size=[3, 3]) @@ -1331,10 +1333,13 @@ KERNEL_INFOS.extend( xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), ], + seed=0, ), KernelInfo( F.gaussian_blur_video, sample_inputs_fn=sample_inputs_gaussian_blur_video, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + seed=0, ), ] ) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 56c473a23..c08228769 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -6,7 +6,7 @@ import PIL.Image import pytest import torch -from common_utils import cache, cpu_and_gpu, needs_cuda +from common_utils import cache, cpu_and_gpu, needs_cuda, set_rng_seed from prototype_common_utils import assert_close, make_bounding_boxes, make_image from prototype_transforms_dispatcher_infos import DISPATCHER_INFOS from prototype_transforms_kernel_infos import KERNEL_INFOS @@ -81,6 +81,8 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_scripted_vs_eager(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) kernel_eager = info.kernel kernel_scripted = script(kernel_eager) @@ -111,6 +113,8 @@ class TestKernels: 
@sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_batched_vs_single(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) (batched_input, *other_args), kwargs = args_kwargs.load(device) feature_type = features.Image if features.is_simple_tensor(batched_input) else type(batched_input) @@ -146,6 +150,8 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_no_inplace(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) if input.numel() == 0: @@ -159,6 +165,8 @@ class TestKernels: @sample_inputs @needs_cuda def test_cuda_vs_cpu(self, info, args_kwargs): + if info.seed is not None: + set_rng_seed(info.seed) (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") input_cuda = input_cpu.to("cuda") @@ -170,6 +178,8 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_dtype_and_device_consistency(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) output = info.kernel(input, *other_args, **kwargs) @@ -182,6 +192,8 @@ class TestKernels: @reference_inputs def test_against_reference(self, info, args_kwargs): + if info.seed is not None: + set_rng_seed(info.seed) args, kwargs = args_kwargs.load("cpu") actual = info.kernel(*args, **kwargs) -- GitLab From 1b5e1b4dd4e173655c255a3c472b0a668d8c9414 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 11 Oct 2022 12:41:55 -0400 Subject: [PATCH 033/624] Fix for windows and python 3.8 call to add_dll_directory (#6742) --- torchvision/extension.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torchvision/extension.py b/torchvision/extension.py index 702e7e33b..de5ea0c94 100644 --- a/torchvision/extension.py +++ b/torchvision/extension.py @@ -21,12 +21,16 @@ try: # To find cuda related dlls we need to make sure the # conda environment/bin path is configured Please take a look: # https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python + # Please note: if some path can't be added using add_dll_directory we simply ignore this path if os.name == "nt" and sys.version_info >= (3, 8) and sys.version_info < (3, 9): env_path = os.environ["PATH"] path_arr = env_path.split(";") for path in path_arr: if os.path.exists(path): - os.add_dll_directory(path) # type: ignore[attr-defined] + try: + os.add_dll_directory(path) # type: ignore[attr-defined] + except Exception: + pass lib_path = _get_extension_path("_C") torch.ops.load_library(lib_path) -- GitLab From 9d16da222434c59fe26645c22116618625ccfed0 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 11 Oct 2022 15:04:56 -0400 Subject: [PATCH 034/624] Increase inactivity timeout for binary build jobs (#6746) * Increase inactivity timeout for binary build jobs * Fix binary build steo --- .circleci/config.yml | 17 ++++++++++++++--- .circleci/config.yml.in | 17 ++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1e4f2e319..713c1e6c4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -377,7 +377,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_wheel.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_wheel.sh - store_artifacts: path: dist - persist_to_workspace: @@ -393,7 
+398,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_conda.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_conda.sh - store_artifacts: path: /opt/conda/conda-bld/linux-64 - persist_to_workspace: @@ -411,7 +421,7 @@ jobs: - designate_upload_channel - run: name: Build conda packages - no_output_timeout: 20m + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh @@ -438,6 +448,7 @@ jobs: - designate_upload_channel - run: name: Build wheel packages + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index d93ddb0be..b421dc1a7 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -377,7 +377,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_wheel.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_wheel.sh - store_artifacts: path: dist - persist_to_workspace: @@ -393,7 +398,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_conda.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_conda.sh - store_artifacts: path: /opt/conda/conda-bld/linux-64 - persist_to_workspace: @@ -411,7 +421,7 @@ jobs: - designate_upload_channel - run: name: Build conda packages - no_output_timeout: 20m + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh @@ -438,6 +448,7 @@ jobs: - designate_upload_channel - run: name: Build wheel packages + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh -- GitLab From 11a2eeda8fb127a7ad72b4c98ca918b93055c1e7 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 11 Oct 2022 23:47:46 +0200 Subject: [PATCH 035/624] [proto] Small improvement for tensor equalize op (#6738) * [proto] Small improvement for tensor equalize op * Fix code formatting * Added a comment on the ops --- .../prototype/transforms/functional/_color.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index d11dd3c3b..63fa8a28c 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -183,6 +183,30 @@ def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: return autocontrast_image_pil(inpt) +def _scale_channel(img_chan: torch.Tensor) -> torch.Tensor: + # TODO: we should expect bincount to always be faster than histc, but this + # isn't always the case. Once + # https://github.com/pytorch/pytorch/issues/53194 is fixed, remove the if + # block and only use bincount. 
+ if img_chan.is_cuda: + hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255) + else: + hist = torch.bincount(img_chan.view(-1), minlength=256) + + nonzero_hist = hist[hist != 0] + step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor") + if step == 0: + return img_chan + + lut = torch.div(torch.cumsum(hist, 0) + torch.div(step, 2, rounding_mode="floor"), step, rounding_mode="floor") + # Doing inplace clamp and converting lut to uint8 improves perfs + lut.clamp_(0, 255) + lut = lut.to(torch.uint8) + lut = torch.nn.functional.pad(lut[:-1], [1, 0]) + + return lut[img_chan.to(torch.int64)] + + def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.dtype != torch.uint8: raise TypeError(f"Only torch.uint8 image tensors are supported, but found {image.dtype}") @@ -194,15 +218,9 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image elif image.ndim == 2: - return _FT._scale_channel(image) + return _scale_channel(image) else: - return torch.stack( - [ - # TODO: when merging transforms v1 and v2, we can inline this function call - _FT._equalize_single_image(single_image) - for single_image in image.view(-1, num_channels, height, width) - ] - ).view(image.shape) + return torch.stack([_scale_channel(x) for x in image.view(-1, height, width)]).view(image.shape) equalize_image_pil = _FP.equalize -- GitLab From 0bfbabc2a9841a160a66a72c1e02ca8a97e6f8ee Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 12 Oct 2022 11:41:38 +0200 Subject: [PATCH 036/624] cache traceback together with exceptions (#6748) --- test/common_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 8f07e91d1..9e919a149 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -210,7 +210,7 @@ def cache(fn): """ sentinel = object() out_cache = {} - exc_cache = {} + exc_tb_cache = {} @functools.wraps(fn) def wrapper(*args, **kwargs): @@ -220,14 +220,17 @@ def cache(fn): if out is not sentinel: return out - exc = exc_cache.get(key, sentinel) - if exc is not sentinel: - raise exc + exc_tb = exc_tb_cache.get(key, sentinel) + if exc_tb is not sentinel: + raise exc_tb[0].with_traceback(exc_tb[1]) try: out = fn(*args, **kwargs) except Exception as exc: - exc_cache[key] = exc + # We need to cache the traceback here as well. Otherwise, each re-raise will add the internal pytest + # traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding + # the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details. + exc_tb_cache[key] = exc, exc.__traceback__ raise exc out_cache[key] = out -- GitLab From 7d36d263a8356fac0bb363617b0c57c3bac6f89f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 12 Oct 2022 12:03:22 +0200 Subject: [PATCH 037/624] Seed transform tests (#6749) * Revert "Add seeds on Kernel Info and reduce randomness for Gaussian Blur (#6741)" This reverts commit 6e72f2fda1df6704003742238f0e87732b9635a1. 
* add fixture to fix the RNG seed * re-add changes to gaussian_blur_* sample input shapes Co-authored-by: Vasilis Vryniotis --- test/prototype_transforms_kernel_infos.py | 5 ----- test/test_prototype_transforms_functional.py | 18 ++++++------------ 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index f7b1e71f3..5af2f8f6a 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -49,14 +49,12 @@ class KernelInfo(InfoBase): test_marks=None, # See InfoBase closeness_kwargs=None, - seed=None, ): super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) self.kernel = kernel self.sample_inputs_fn = sample_inputs_fn self.reference_fn = reference_fn self.reference_inputs_fn = reference_inputs_fn - self.seed = seed DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -1333,13 +1331,10 @@ KERNEL_INFOS.extend( xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), ], - seed=0, ), KernelInfo( F.gaussian_blur_video, sample_inputs_fn=sample_inputs_gaussian_blur_video, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - seed=0, ), ] ) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index c08228769..982d776bd 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -67,6 +67,12 @@ def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=No return decorator +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + class TestKernels: sample_inputs = make_info_args_kwargs_parametrization( KERNEL_INFOS, @@ -81,8 +87,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_scripted_vs_eager(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) kernel_eager = info.kernel kernel_scripted = script(kernel_eager) @@ -113,8 +117,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_batched_vs_single(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) (batched_input, *other_args), kwargs = args_kwargs.load(device) feature_type = features.Image if features.is_simple_tensor(batched_input) else type(batched_input) @@ -150,8 +152,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_no_inplace(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) if input.numel() == 0: @@ -165,8 +165,6 @@ class TestKernels: @sample_inputs @needs_cuda def test_cuda_vs_cpu(self, info, args_kwargs): - if info.seed is not None: - set_rng_seed(info.seed) (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") input_cuda = input_cpu.to("cuda") @@ -178,8 +176,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_dtype_and_device_consistency(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) output = info.kernel(input, *other_args, **kwargs) @@ -192,8 +188,6 @@ class TestKernels: @reference_inputs def test_against_reference(self, info, args_kwargs): - if info.seed is not None: - set_rng_seed(info.seed) args, kwargs = args_kwargs.load("cpu") actual = 
info.kernel(*args, **kwargs) -- GitLab From 54a2d4e8f7a4568823532d4342f6ba13e7339dce Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 12 Oct 2022 14:44:10 +0100 Subject: [PATCH 038/624] [prototype] Video Classes Clean up (#6751) * Removing unnecessary methods/classes. * Unions instead of ImageOrVideo types * Fixing JIT issue. --- torchvision/prototype/features/__init__.py | 8 +++----- torchvision/prototype/features/_encoded.py | 4 ---- torchvision/prototype/features/_image.py | 14 +------------- torchvision/prototype/features/_video.py | 8 +------- torchvision/prototype/transforms/_augment.py | 7 ++++--- .../prototype/transforms/_auto_augment.py | 6 +++--- torchvision/prototype/transforms/_color.py | 8 +++++--- .../prototype/transforms/_deprecated.py | 8 ++++++-- torchvision/prototype/transforms/_geometry.py | 17 ++++++++--------- torchvision/prototype/transforms/_meta.py | 8 +++++--- torchvision/prototype/transforms/_misc.py | 8 ++++++-- .../transforms/functional/_augment.py | 6 ++++-- .../transforms/functional/_deprecated.py | 2 +- .../transforms/functional/_geometry.py | 19 ++++++++----------- .../prototype/transforms/functional/_meta.py | 12 ++++++------ .../prototype/transforms/functional/_misc.py | 7 +++++-- 16 files changed, 66 insertions(+), 76 deletions(-) diff --git a/torchvision/prototype/features/__init__.py b/torchvision/prototype/features/__init__.py index 944ae9bd3..8a461e1be 100644 --- a/torchvision/prototype/features/__init__.py +++ b/torchvision/prototype/features/__init__.py @@ -1,5 +1,5 @@ from ._bounding_box import BoundingBox, BoundingBoxFormat -from ._encoded import EncodedData, EncodedImage, EncodedVideo +from ._encoded import EncodedData, EncodedImage from ._feature import _Feature, FillType, FillTypeJIT, InputType, InputTypeJIT, is_simple_tensor from ._image import ( ColorSpace, @@ -14,12 +14,10 @@ from ._image import ( from ._label import Label, OneHotLabel from ._mask import Mask from ._video import ( - ImageOrVideoType, - ImageOrVideoTypeJIT, LegacyVideoType, LegacyVideoTypeJIT, - TensorImageOrVideoType, - TensorImageOrVideoTypeJIT, + TensorVideoType, + TensorVideoTypeJIT, Video, VideoType, VideoTypeJIT, diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 9347b4eca..ffa347a3e 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -55,7 +55,3 @@ class EncodedImage(EncodedData): self._spatial_size = image.height, image.width return self._spatial_size - - -class EncodedVideo(EncodedData): - pass diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 6d52a178b..e9128b94b 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -6,10 +6,8 @@ from typing import Any, cast, List, Optional, Tuple, Union import PIL.Image import torch from torchvision._utils import StrEnum -from torchvision.transforms.functional import InterpolationMode, to_pil_image -from torchvision.utils import draw_bounding_boxes, make_grid +from torchvision.transforms.functional import InterpolationMode -from ._bounding_box import BoundingBox from ._feature import _Feature, FillTypeJIT @@ -124,16 +122,6 @@ class Image(_Feature): color_space=color_space, ) - def show(self) -> None: - # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we - # promote this out of the prototype state - to_pil_image(make_grid(self.view(-1, 
*self.shape[-3:]))).show() - - def draw_bounding_box(self, bounding_box: BoundingBox, **kwargs: Any) -> Image: - # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we - # promote this out of the prototype state - return Image.wrap_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs)) - def horizontal_flip(self) -> Image: output = self._F.horizontal_flip_image_tensor(self) return Image.wrap_like(self, output) diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index ca4253c73..9dfff7f96 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -7,7 +7,7 @@ import torch from torchvision.transforms.functional import InterpolationMode from ._feature import _Feature, FillTypeJIT -from ._image import ColorSpace, ImageType, ImageTypeJIT, TensorImageType, TensorImageTypeJIT +from ._image import ColorSpace class Video(_Feature): @@ -236,9 +236,3 @@ LegacyVideoType = torch.Tensor LegacyVideoTypeJIT = torch.Tensor TensorVideoType = Union[torch.Tensor, Video] TensorVideoTypeJIT = torch.Tensor - -# TODO: decide if we should do definitions for both Images and Videos or use unions in the methods -ImageOrVideoType = Union[ImageType, VideoType] -ImageOrVideoTypeJIT = Union[ImageTypeJIT, VideoTypeJIT] -TensorImageOrVideoType = Union[TensorImageType, TensorVideoType] -TensorImageOrVideoTypeJIT = Union[TensorImageTypeJIT, TensorVideoTypeJIT] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index f0e527385..9a4d32fc6 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -1,7 +1,7 @@ import math import numbers import warnings -from typing import Any, cast, Dict, List, Optional, Tuple +from typing import Any, cast, Dict, List, Optional, Tuple, Union import PIL.Image import torch @@ -92,14 +92,15 @@ class RandomErasing(_RandomApplyTransform): return dict(i=i, j=j, h=h, w=w, v=v) - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: if params["v"] is not None: inpt = F.erase(inpt, **params, inplace=self.inplace) return inpt -# TODO: Add support for Video: https://github.com/pytorch/vision/issues/6731 class _BaseMixupCutmix(_RandomApplyTransform): def __init__(self, alpha: float, p: float = 0.5) -> None: super().__init__(p=p) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index b35b5529b..02c1a18da 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -35,7 +35,7 @@ class _AutoAugmentBase(Transform): self, sample: Any, unsupported_types: Tuple[Type, ...] 
= (features.BoundingBox, features.Mask), - ) -> Tuple[int, features.ImageOrVideoType]: + ) -> Tuple[int, Union[features.ImageType, features.VideoType]]: sample_flat, _ = tree_flatten(sample) image_or_videos = [] for id, inpt in enumerate(sample_flat): @@ -60,12 +60,12 @@ class _AutoAugmentBase(Transform): def _apply_image_or_video_transform( self, - image: features.ImageOrVideoType, + image: Union[features.ImageType, features.VideoType], transform_id: str, magnitude: float, interpolation: InterpolationMode, fill: Dict[Type, features.FillType], - ) -> features.ImageOrVideoType: + ) -> Union[features.ImageType, features.VideoType]: fill_ = fill[type(image)] fill_ = F._geometry._convert_fill_arg(fill_) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 616669cc8..609f03bf4 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -111,8 +111,8 @@ class RandomPhotometricDistort(Transform): ) def _permute_channels( - self, inpt: features.ImageOrVideoType, permutation: torch.Tensor - ) -> features.ImageOrVideoType: + self, inpt: Union[features.ImageType, features.VideoType], permutation: torch.Tensor + ) -> Union[features.ImageType, features.VideoType]: if isinstance(inpt, PIL.Image.Image): inpt = F.pil_to_tensor(inpt) @@ -126,7 +126,9 @@ class RandomPhotometricDistort(Transform): return output - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: if params["brightness"]: inpt = F.adjust_brightness( inpt, brightness_factor=ColorJitter._generate_value(self.brightness[0], self.brightness[1]) diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index 0cc4a90c4..e401534f4 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -52,7 +52,9 @@ class Grayscale(Transform): super().__init__() self.num_output_channels = num_output_channels - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: output = _F.rgb_to_grayscale(inpt, num_output_channels=self.num_output_channels) if isinstance(inpt, (features.Image, features.Video)): output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] @@ -81,7 +83,9 @@ class RandomGrayscale(_RandomApplyTransform): num_input_channels, *_ = query_chw(sample) return dict(num_input_channels=num_input_channels) - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: output = _F.rgb_to_grayscale(inpt, num_output_channels=params["num_input_channels"]) if isinstance(inpt, (features.Image, features.Video)): output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 91d7c294e..b09533273 100644 --- a/torchvision/prototype/transforms/_geometry.py 
+++ b/torchvision/prototype/transforms/_geometry.py @@ -148,6 +148,9 @@ class RandomResizedCrop(Transform): ) +ImageOrVideoTypeJIT = Union[features.ImageTypeJIT, features.VideoTypeJIT] + + class FiveCrop(Transform): """ Example: @@ -177,14 +180,8 @@ class FiveCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") def _transform( - self, inpt: features.ImageOrVideoType, params: Dict[str, Any] - ) -> Tuple[ - features.ImageOrVideoType, - features.ImageOrVideoType, - features.ImageOrVideoType, - features.ImageOrVideoType, - features.ImageOrVideoType, - ]: + self, inpt: ImageOrVideoTypeJIT, params: Dict[str, Any] + ) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: return F.five_crop(inpt, self.size) def forward(self, *inputs: Any) -> Any: @@ -205,7 +202,9 @@ class TenCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> List[features.ImageOrVideoType]: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[List[features.ImageTypeJIT], List[features.VideoTypeJIT]]: return F.ten_crop(inpt, self.size, vertical_flip=self.vertical_flip) def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index dc109269f..bdfe8b47a 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -29,8 +29,8 @@ class ConvertImageDtype(Transform): self.dtype = dtype def _transform( - self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any] - ) -> features.TensorImageOrVideoType: + self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] + ) -> Union[features.TensorImageType, features.TensorVideoType]: output = F.convert_image_dtype(inpt, dtype=self.dtype) return ( output if features.is_simple_tensor(inpt) else type(inpt).wrap_like(inpt, output) # type: ignore[attr-defined] @@ -58,7 +58,9 @@ class ConvertColorSpace(Transform): self.copy = copy - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: return F.convert_color_space( inpt, color_space=self.color_space, old_color_space=self.old_color_space, copy=self.copy ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index d3c8a57dc..945aa8456 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -68,7 +68,9 @@ class LinearTransformation(Transform): return super().forward(*inputs) - def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: + def _transform( + self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] + ) -> torch.Tensor: # Image instance after linear transformation is not Image anymore due to unknown data range # Thus we will return Tensor for input Image @@ -101,7 +103,9 @@ class Normalize(Transform): self.std = list(std) self.inplace = inplace - def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: 
+ def _transform( + self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] + ) -> torch.Tensor: return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) def forward(self, *inpts: Any) -> Any: diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 57c3602cc..20e5ac916 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -1,3 +1,5 @@ +from typing import Union + import PIL.Image import torch @@ -24,14 +26,14 @@ def erase_video( def erase( - inpt: features.ImageOrVideoTypeJIT, + inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT], i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False, -) -> features.ImageOrVideoTypeJIT: +) -> Union[features.ImageTypeJIT, features.VideoTypeJIT]: if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): diff --git a/torchvision/prototype/transforms/functional/_deprecated.py b/torchvision/prototype/transforms/functional/_deprecated.py index 854920b96..e18c267e8 100644 --- a/torchvision/prototype/transforms/functional/_deprecated.py +++ b/torchvision/prototype/transforms/functional/_deprecated.py @@ -59,7 +59,7 @@ def to_tensor(inpt: Any) -> torch.Tensor: return _F.to_tensor(inpt) -def get_image_size(inpt: features.ImageOrVideoTypeJIT) -> List[int]: +def get_image_size(inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT]) -> List[int]: warnings.warn( "The function `get_image_size(...)` is deprecated and will be removed in a future release. " "Instead, please use `get_spatial_size(...)` which returns `[h, w]` instead of `[w, h]`." 
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 590a13310..43962ad4d 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1382,16 +1382,13 @@ def five_crop_video( return five_crop_image_tensor(video, size) +ImageOrVideoTypeJIT = Union[features.ImageTypeJIT, features.VideoTypeJIT] + + def five_crop( - inpt: features.ImageOrVideoTypeJIT, size: List[int] -) -> Tuple[ - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, -]: - # TODO: consider breaking BC here to return List[features.ImageOrVideoTypeJIT] to align this op with `ten_crop` + inpt: ImageOrVideoTypeJIT, size: List[int] +) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: + # TODO: consider breaking BC here to return List[features.ImageTypeJIT/VideoTypeJIT] to align this op with `ten_crop` if isinstance(inpt, torch.Tensor): output = five_crop_image_tensor(inpt, size) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): @@ -1434,8 +1431,8 @@ def ten_crop_video(video: torch.Tensor, size: List[int], vertical_flip: bool = F def ten_crop( - inpt: features.ImageOrVideoTypeJIT, size: List[int], vertical_flip: bool = False -) -> List[features.ImageOrVideoTypeJIT]: + inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT], size: List[int], vertical_flip: bool = False +) -> Union[List[features.ImageTypeJIT], List[features.VideoTypeJIT]]: if isinstance(inpt, torch.Tensor): output = ten_crop_image_tensor(inpt, size, vertical_flip=vertical_flip) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index a118784eb..2903d73ce 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -1,4 +1,4 @@ -from typing import cast, List, Optional, Tuple +from typing import List, Optional, Tuple, Union import PIL.Image import torch @@ -11,7 +11,7 @@ get_dimensions_image_tensor = _FT.get_dimensions get_dimensions_image_pil = _FP.get_dimensions -def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: +def get_dimensions(image: Union[features.ImageTypeJIT, features.VideoTypeJIT]) -> List[int]: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) ): @@ -32,7 +32,7 @@ def get_num_channels_video(video: torch.Tensor) -> int: return get_num_channels_image_tensor(video) -def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: +def get_num_channels(image: Union[features.ImageTypeJIT, features.VideoTypeJIT]) -> int: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) ): @@ -262,11 +262,11 @@ def convert_color_space_video( def convert_color_space( - inpt: features.ImageOrVideoTypeJIT, + inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT], color_space: ColorSpace, old_color_space: Optional[ColorSpace] = None, copy: bool = True, -) -> features.ImageOrVideoTypeJIT: +) -> Union[features.ImageTypeJIT, features.VideoTypeJIT]: if isinstance(inpt, torch.Tensor) and ( torch.jit.is_scripting() or not 
isinstance(inpt, (features.Image, features.Video)) ): @@ -281,4 +281,4 @@ def convert_color_space( elif isinstance(inpt, (features.Image, features.Video)): return inpt.to_color_space(color_space, copy=copy) else: - return cast(features.ImageOrVideoTypeJIT, convert_color_space_image_pil(inpt, color_space, copy=copy)) + return convert_color_space_image_pil(inpt, color_space, copy=copy) diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 79a358b4e..8fda24e17 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Union import PIL.Image import torch @@ -14,7 +14,10 @@ def normalize_video(video: torch.Tensor, mean: List[float], std: List[float], in def normalize( - inpt: features.TensorImageOrVideoTypeJIT, mean: List[float], std: List[float], inplace: bool = False + inpt: Union[features.TensorImageTypeJIT, features.TensorVideoTypeJIT], + mean: List[float], + std: List[float], + inplace: bool = False, ) -> torch.Tensor: if torch.jit.is_scripting(): correct_type = isinstance(inpt, torch.Tensor) -- GitLab From b16dec19a4b737b3fb120c48c7da4b07456902fa Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 13 Oct 2022 13:29:45 +0200 Subject: [PATCH 039/624] [proto] Performance improvements for equalize op (#6757) * [proto] Performance improvements for equalize op * Added tests --- test/test_prototype_transforms_functional.py | 11 ++++ .../prototype/transforms/functional/_color.py | 59 +++++++++++-------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 982d776bd..34291611d 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1037,3 +1037,14 @@ def test_to_image_pil(inpt, mode): assert isinstance(output, PIL.Image.Image) assert np.asarray(inpt).sum() == np.asarray(output).sum() + + +def test_equalize_image_tensor_edge_cases(): + inpt = torch.zeros(3, 200, 200, dtype=torch.uint8) + output = F.equalize_image_tensor(inpt) + torch.testing.assert_close(inpt, output) + + inpt = torch.zeros(5, 3, 200, 200, dtype=torch.uint8) + inpt[..., 100:, 100:] = 1 + output = F.equalize_image_tensor(inpt) + assert output.unique().tolist() == [0, 255] diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 63fa8a28c..7cbf8885c 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -183,28 +183,37 @@ def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: return autocontrast_image_pil(inpt) -def _scale_channel(img_chan: torch.Tensor) -> torch.Tensor: - # TODO: we should expect bincount to always be faster than histc, but this - # isn't always the case. Once - # https://github.com/pytorch/pytorch/issues/53194 is fixed, remove the if - # block and only use bincount. 
- if img_chan.is_cuda: - hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255) - else: - hist = torch.bincount(img_chan.view(-1), minlength=256) - - nonzero_hist = hist[hist != 0] - step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor") - if step == 0: - return img_chan - - lut = torch.div(torch.cumsum(hist, 0) + torch.div(step, 2, rounding_mode="floor"), step, rounding_mode="floor") - # Doing inplace clamp and converting lut to uint8 improves perfs - lut.clamp_(0, 255) - lut = lut.to(torch.uint8) - lut = torch.nn.functional.pad(lut[:-1], [1, 0]) - - return lut[img_chan.to(torch.int64)] +def _equalize_image_tensor_vec(img: torch.Tensor) -> torch.Tensor: + # input img shape should be [N, H, W] + shape = img.shape + # Compute image histogram: + flat_img = img.flatten(start_dim=1).to(torch.long) # -> [N, H * W] + hist = flat_img.new_zeros(shape[0], 256) + hist.scatter_add_(dim=1, index=flat_img, src=flat_img.new_ones(1).expand_as(flat_img)) + + # Compute image cdf + chist = hist.cumsum_(dim=1) + # Compute steps, where step per channel is nonzero_hist[:-1].sum() // 255 + # Trick: nonzero_hist[:-1].sum() == chist[idx - 1], where idx = chist.argmax() + idx = chist.argmax(dim=1).sub_(1) + # If histogram is degenerate (hist of zero image), index is -1 + neg_idx_mask = idx < 0 + idx.clamp_(min=0) + step = chist.gather(dim=1, index=idx.unsqueeze(1)) + step[neg_idx_mask] = 0 + step.div_(255, rounding_mode="floor") + + # Compute batched Look-up-table: + # Necessary to avoid an integer division by zero, which raises + clamped_step = step.clamp(min=1) + chist.add_(torch.div(step, 2, rounding_mode="floor")).div_(clamped_step, rounding_mode="floor").clamp_(0, 255) + lut = chist.to(torch.uint8) # [N, 256] + + # Pad lut with zeros + zeros = lut.new_zeros((1, 1)).expand(shape[0], 1) + lut = torch.cat([zeros, lut[:, :-1]], dim=1) + + return torch.where((step == 0).unsqueeze(-1), img, lut.gather(dim=1, index=flat_img).view_as(img)) def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: @@ -217,10 +226,8 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image - elif image.ndim == 2: - return _scale_channel(image) - else: - return torch.stack([_scale_channel(x) for x in image.view(-1, height, width)]).view(image.shape) + + return _equalize_image_tensor_vec(image.view(-1, height, width)).view(image.shape) equalize_image_pil = _FP.equalize -- GitLab From 6d774c6fe53f2492e782095bacd544eebfeb0fc5 Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 13 Oct 2022 13:31:26 +0200 Subject: [PATCH 040/624] Fixed repr for ElasticTransform (#6758) Co-authored-by: Vasilis Vryniotis --- torchvision/transforms/transforms.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 18e2ffc96..6011e2372 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -2133,9 +2133,9 @@ class ElasticTransform(torch.nn.Module): return F.elastic_transform(tensor, displacement, self.interpolation, self.fill) def __repr__(self): - format_string = self.__class__.__name__ + "(alpha=" - format_string += str(self.alpha) + ")" - format_string += ", (sigma=" + str(self.sigma) + ")" - format_string += ", interpolation={self.interpolation}" - format_string += ", fill={self.fill})" + format_string = self.__class__.__name__ + format_string += f"(alpha={self.alpha}" + format_string += f", sigma={self.sigma}" + format_string += f", 
interpolation={self.interpolation}" + format_string += f", fill={self.fill})" return format_string -- GitLab From 3eafe77a51ba1aab061b2564f9cd8774a0df3be7 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 13:45:31 +0200 Subject: [PATCH 041/624] expand ToDtype to support multiple conversions at once (#6756) * expand ToDtype to support multiple conversions at once * simplify --- test/test_prototype_transforms.py | 38 +++++++++++++++++++++++ torchvision/prototype/transforms/_misc.py | 21 +++++++++---- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 2c095fa6e..f18597a24 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1789,3 +1789,41 @@ class TestRandomResize: mock_resize.assert_called_with( inpt_sentinel, size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel ) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtypes"), + [ + ( + torch.float64, + {torch.Tensor: torch.float64, features.Image: torch.float64, features.BoundingBox: torch.float64}, + ), + ( + {torch.Tensor: torch.int32, features.Image: torch.float32, features.BoundingBox: torch.float64}, + {torch.Tensor: torch.int32, features.Image: torch.float32, features.BoundingBox: torch.float64}, + ), + ], +) +def test_to_dtype(dtype, expected_dtypes): + sample = dict( + plain_tensor=torch.testing.make_tensor(5, dtype=torch.int64, device="cpu"), + image=make_image(dtype=torch.uint8), + bounding_box=make_bounding_box(format=features.BoundingBoxFormat.XYXY, dtype=torch.float32), + str="str", + int=0, + ) + + transform = transforms.ToDtype(dtype) + transformed_sample = transform(sample) + + for key, value in sample.items(): + value_type = type(value) + transformed_value = transformed_sample[key] + + # make sure the transformation retains the type + assert isinstance(transformed_value, value_type) + + if isinstance(value, torch.Tensor): + assert transformed_value.dtype is expected_dtypes[value_type] + else: + assert transformed_value is value diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 945aa8456..eac65da6e 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -1,4 +1,5 @@ import functools +from collections import defaultdict from typing import Any, Callable, Dict, Sequence, Type, Union import PIL.Image @@ -144,14 +145,22 @@ class GaussianBlur(Transform): return F.gaussian_blur(inpt, self.kernel_size, **params) -# TODO: Enhance as described at https://github.com/pytorch/vision/issues/6697 -class ToDtype(Lambda): - def __init__(self, dtype: torch.dtype, *types: Type) -> None: +class ToDtype(Transform): + _transformed_types = (torch.Tensor,) + + def _default_dtype(self, dtype: torch.dtype) -> torch.dtype: + return dtype + + def __init__(self, dtype: Union[torch.dtype, Dict[Type, torch.dtype]]) -> None: + super().__init__() + if not isinstance(dtype, dict): + # This weird looking construct only exists, since `lambda`'s cannot be serialized by pickle. 
+ # If it were possible, we could replace this with `defaultdict(lambda: dtype)` + dtype = defaultdict(functools.partial(self._default_dtype, dtype)) self.dtype = dtype - super().__init__(functools.partial(torch.Tensor.to, dtype=dtype), *types or (torch.Tensor,)) - def extra_repr(self) -> str: - return ", ".join([f"dtype={self.dtype}", f"types={[type.__name__ for type in self.types]}"]) + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return inpt.to(self.dtype[type(inpt)]) class RemoveSmallBoundingBoxes(Transform): -- GitLab From bdc55567d0e0f639b1c7b1dc4374819bd1b9693f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 13:59:04 +0200 Subject: [PATCH 042/624] introduce nearest-exact interpolation (#6754) * introduce nearest-exact interpolation * update prototype tests * update stable tests --- test/prototype_transforms_kernel_infos.py | 2 ++ test/test_functional_tensor.py | 11 ++++++++--- test/test_transforms_tensor.py | 15 ++++++++++----- torchvision/transforms/functional.py | 11 ++++++++--- torchvision/transforms/transforms.py | 8 ++++---- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 5af2f8f6a..c455caa6b 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -232,6 +232,7 @@ def reference_inputs_resize_image_tensor(): make_image_loaders(extra_dims=[()]), [ F.InterpolationMode.NEAREST, + F.InterpolationMode.NEAREST_EXACT, F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC, ], @@ -881,6 +882,7 @@ def reference_inputs_resized_crop_image_tensor(): make_image_loaders(extra_dims=[()]), [ F.InterpolationMode.NEAREST, + F.InterpolationMode.NEAREST_EXACT, F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC, ], diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 9bdd4ab83..25f4e709f 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -25,7 +25,12 @@ from common_utils import ( ) from torchvision.transforms import InterpolationMode -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) @pytest.mark.parametrize("device", cpu_and_gpu()) @@ -506,7 +511,7 @@ def test_perspective_interpolation_warning(): ], ) @pytest.mark.parametrize("max_size", [None, 34, 40, 1000]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) +@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize(device, dt, size, max_size, interpolation): if dt == torch.float16 and device == "cpu": @@ -966,7 +971,7 @@ def test_pad(device, dt, pad, config): @pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("mode", [NEAREST, BILINEAR, BICUBIC]) +@pytest.mark.parametrize("mode", [NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC]) def test_resized_crop(device, mode): # test values of F.resized_crop in several cases: # 1) resize to the same size, crop to the same size => should be identity diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index f4ca544de..7b75a4436 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -20,7 +20,12 @@ from torchvision import transforms as T from torchvision.transforms import 
functional as F, InterpolationMode from torchvision.transforms.autoaugment import _apply_op -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) def _test_transform_vs_scripted(transform, s_transform, tensor, msg=None): @@ -378,7 +383,7 @@ class TestResize: @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64]) @pytest.mark.parametrize("size", [[32], [32, 32], (32, 32), [34, 35]]) @pytest.mark.parametrize("max_size", [None, 35, 1000]) - @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) + @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize_scripted(self, dt, size, max_size, interpolation, device): tensor, _ = _create_data(height=34, width=36, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -402,12 +407,12 @@ class TestResize: @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) @pytest.mark.parametrize("ratio", [(0.75, 1.333), [0.75, 1.333]]) @pytest.mark.parametrize("size", [(32,), [44], [32], [32, 32], (32, 32), [44, 55]]) - @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) + @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC, NEAREST_EXACT]) @pytest.mark.parametrize("antialias", [None, True, False]) def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device): - if antialias and interpolation == NEAREST: - pytest.skip("Can not resize if interpolation mode is NEAREST and antialias=True") + if antialias and interpolation in {NEAREST, NEAREST_EXACT}: + pytest.skip(f"Can not resize if interpolation mode is {interpolation} and antialias=True") tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index e82c5eca8..f06b5dbc9 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -20,10 +20,12 @@ from . import functional_pil as F_pil, functional_tensor as F_t class InterpolationMode(Enum): """Interpolation modes - Available interpolation methods are ``nearest``, ``bilinear``, ``bicubic``, ``box``, ``hamming``, and ``lanczos``. + Available interpolation methods are ``nearest``, ``nearest-exact``, ``bilinear``, ``bicubic``, ``box``, ``hamming``, + and ``lanczos``. """ NEAREST = "nearest" + NEAREST_EXACT = "nearest-exact" BILINEAR = "bilinear" BICUBIC = "bicubic" # For PIL compatibility @@ -50,6 +52,7 @@ pil_modes_mapping = { InterpolationMode.NEAREST: 0, InterpolationMode.BILINEAR: 2, InterpolationMode.BICUBIC: 3, + InterpolationMode.NEAREST_EXACT: 0, InterpolationMode.BOX: 4, InterpolationMode.HAMMING: 5, InterpolationMode.LANCZOS: 1, @@ -416,7 +419,8 @@ def resize( interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, - ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are + supported. 
For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. max_size (int, optional): The maximum allowed for the longer edge of @@ -617,7 +621,8 @@ def resized_crop( interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, - ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are + supported. For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. antialias (bool, optional): antialias flag. If ``img`` is PIL Image, the flag is ignored and anti-alias diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 6011e2372..985937678 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -296,8 +296,8 @@ class Resize(torch.nn.Module): In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. - If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` and - ``InterpolationMode.BICUBIC`` are supported. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. max_size (int, optional): The maximum allowed for the longer edge of @@ -865,8 +865,8 @@ class RandomResizedCrop(torch.nn.Module): resizing. interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. - If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` and - ``InterpolationMode.BICUBIC`` are supported. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. antialias (bool, optional): antialias flag. 
If ``img`` is PIL Image, the flag is ignored and anti-alias -- GitLab From e1b21f9c20e70ee5385ecd6ea2268010b8c4aed1 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 16:46:12 +0200 Subject: [PATCH 043/624] introduce _check method for type checks on prototype transforms (#6503) * introduce _check method for type checks on prototype transforms * cleanup * Update torchvision/prototype/transforms/_geometry.py Co-authored-by: Vasilis Vryniotis * introduce _check on new transforms * _check -> _check_inputs * always check inputs in _RandomApplyTransform Co-authored-by: Vasilis Vryniotis --- torchvision/prototype/transforms/_augment.py | 9 ++- torchvision/prototype/transforms/_geometry.py | 63 +++++++++---------- torchvision/prototype/transforms/_misc.py | 15 ++--- .../prototype/transforms/_transform.py | 20 +++++- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 9a4d32fc6..5861dd291 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -107,17 +107,16 @@ class _BaseMixupCutmix(_RandomApplyTransform): self.alpha = alpha self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def forward(self, *inputs: Any) -> Any: + def _check_inputs(self, sample: Any) -> None: if not ( - has_any(inputs, features.Image, features.Video, features.is_simple_tensor) - and has_any(inputs, features.OneHotLabel) + has_any(sample, features.Image, features.Video, features.is_simple_tensor) + and has_any(sample, features.OneHotLabel) ): raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") - if has_any(inputs, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): + if has_any(sample, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." 
) - return super().forward(*inputs) def _mixup_onehotlabel(self, inpt: features.OneHotLabel, lam: float) -> features.OneHotLabel: if inpt.ndim < 2: diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index b09533273..5b31adc9e 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -184,10 +184,9 @@ class FiveCrop(Transform): ) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: return F.five_crop(inpt, self.size) - def forward(self, *inputs: Any) -> Any: - if has_any(inputs, features.BoundingBox, features.Mask): + def _check_inputs(self, sample: Any) -> None: + if has_any(sample, features.BoundingBox, features.Mask): raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") - return super().forward(*inputs) class TenCrop(Transform): @@ -202,16 +201,15 @@ class TenCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip + def _check_inputs(self, sample: Any) -> None: + if has_any(sample, features.BoundingBox, features.Mask): + raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") + def _transform( self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] ) -> Union[List[features.ImageTypeJIT], List[features.VideoTypeJIT]]: return F.ten_crop(inpt, self.size, vertical_flip=self.vertical_flip) - def forward(self, *inputs: Any) -> Any: - if has_any(inputs, features.BoundingBox, features.Mask): - raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") - return super().forward(*inputs) - class Pad(Transform): def __init__( @@ -616,6 +614,17 @@ class RandomIoUCrop(Transform): self.options = sampler_options self.trials = trials + def _check_inputs(self, sample: Any) -> None: + if not ( + has_all(sample, features.BoundingBox) + and has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor) + and has_any(sample, features.Label, features.OneHotLabel) + ): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain Images or PIL Images, " + "BoundingBoxes and Labels or OneHotLabels. Sample can also contain Masks." + ) + def _get_params(self, sample: Any) -> Dict[str, Any]: orig_h, orig_w = query_spatial_size(sample) bboxes = query_bounding_box(sample) @@ -688,18 +697,6 @@ class RandomIoUCrop(Transform): return output - def forward(self, *inputs: Any) -> Any: - if not ( - has_all(inputs, features.BoundingBox) - and has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor) - and has_any(inputs, features.Label, features.OneHotLabel) - ): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain Images or PIL Images, " - "BoundingBoxes and Labels or OneHotLabels. Sample can also contain Masks." - ) - return super().forward(*inputs) - class ScaleJitter(Transform): def __init__( @@ -774,6 +771,18 @@ class FixedSizeCrop(Transform): self.padding_mode = padding_mode + def _check_inputs(self, sample: Any) -> None: + if not has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." 
+ ) + + if has_any(sample, features.BoundingBox) and not has_any(sample, features.Label, features.OneHotLabel): + raise TypeError( + f"If a BoundingBox is contained in the input sample, " + f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." + ) + def _get_params(self, sample: Any) -> Dict[str, Any]: height, width = query_spatial_size(sample) new_height = min(height, self.crop_height) @@ -850,20 +859,6 @@ class FixedSizeCrop(Transform): return inpt - def forward(self, *inputs: Any) -> Any: - if not has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." - ) - - if has_any(inputs, features.BoundingBox) and not has_any(inputs, features.Label, features.OneHotLabel): - raise TypeError( - f"If a BoundingBox is contained in the input sample, " - f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." - ) - - return super().forward(*inputs) - class RandomResize(Transform): def __init__( diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index eac65da6e..61be60cee 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -63,12 +63,10 @@ class LinearTransformation(Transform): self.transformation_matrix = transformation_matrix self.mean_vector = mean_vector - def forward(self, *inputs: Any) -> Any: - if has_any(inputs, PIL.Image.Image): + def _check_inputs(self, sample: Any) -> Any: + if has_any(sample, PIL.Image.Image): raise TypeError("LinearTransformation does not work on PIL Images") - return super().forward(*inputs) - def _transform( self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] ) -> torch.Tensor: @@ -104,16 +102,15 @@ class Normalize(Transform): self.std = list(std) self.inplace = inplace + def _check_inputs(self, sample: Any) -> Any: + if has_any(sample, PIL.Image.Image): + raise TypeError(f"{type(self).__name__}() does not support PIL images.") + def _transform( self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] ) -> torch.Tensor: return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) - def forward(self, *inpts: Any) -> Any: - if has_any(inpts, PIL.Image.Image): - raise TypeError(f"{type(self).__name__}() does not support PIL images.") - return super().forward(*inpts) - class GaussianBlur(Transform): def __init__( diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 54ae91b79..056c2da9f 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -23,6 +23,9 @@ class Transform(nn.Module): super().__init__() _log_api_usage_once(self) + def _check_inputs(self, sample: Any) -> None: + pass + def _get_params(self, sample: Any) -> Dict[str, Any]: return dict() @@ -32,6 +35,8 @@ class Transform(nn.Module): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] + self._check_inputs(sample) + params = self._get_params(sample) flat_inputs, spec = tree_flatten(sample) @@ -64,9 +69,22 @@ class _RandomApplyTransform(Transform): self.p = p def forward(self, *inputs: Any) -> Any: + # We need to almost duplicate `Transform.forward()` here since we always want to check the inputs, but return + # early afterwards in case the random check triggers. 
The same result could be achieved by calling + # `super().forward()` after the random check, but that would call `self._check_inputs` twice. + sample = inputs if len(inputs) > 1 else inputs[0] + self._check_inputs(sample) + if torch.rand(1) >= self.p: return sample - return super().forward(sample) + params = self._get_params(sample) + + flat_inputs, spec = tree_flatten(sample) + flat_outputs = [ + self._transform(inpt, params) if _isinstance(inpt, self._transformed_types) else inpt + for inpt in flat_inputs + ] + return tree_unflatten(flat_outputs, spec) -- GitLab From dc5fd831ed9f4a0c58a194853ffa9cce6c240026 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 17:13:45 +0200 Subject: [PATCH 044/624] improve test id for consistency tests (#6763) --- test/test_prototype_transforms_consistency.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index f335220fb..589d45595 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -575,9 +575,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s @pytest.mark.parametrize( ("config", "args_kwargs"), [ - pytest.param(config, args_kwargs, id=f"{config.legacy_cls.__name__}({args_kwargs})") + pytest.param( + config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" + ) for config in CONSISTENCY_CONFIGS - for args_kwargs in config.args_kwargs + for idx, args_kwargs in enumerate(config.args_kwargs) ], ) def test_call_consistency(config, args_kwargs): -- GitLab From e3238e5af74e2c1af594ab4bae8cd6bfbf5bce2c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 14 Oct 2022 10:01:40 +0200 Subject: [PATCH 045/624] only flatten a pytree once (#6767) --- test/test_prototype_transforms.py | 45 ++++++------ test/test_prototype_transforms_consistency.py | 2 +- torchvision/prototype/transforms/_augment.py | 26 +++---- .../prototype/transforms/_auto_augment.py | 54 +++++++------- torchvision/prototype/transforms/_color.py | 8 +-- .../prototype/transforms/_deprecated.py | 6 +- torchvision/prototype/transforms/_geometry.py | 72 ++++++++++--------- torchvision/prototype/transforms/_misc.py | 8 +-- .../prototype/transforms/_transform.py | 25 +++---- torchvision/prototype/transforms/_utils.py | 42 +++++------ 10 files changed, 143 insertions(+), 145 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index f18597a24..11a51f7b5 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -437,7 +437,7 @@ class TestRandomZoomOut: image = mocker.MagicMock(spec=features.Image) h, w = image.spatial_size = (24, 32) - params = transform._get_params(image) + params = transform._get_params([image]) assert len(params["padding"]) == 4 assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w @@ -462,7 +462,7 @@ class TestRandomZoomOut: _ = transform(inpt) torch.manual_seed(12) torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, fill=fill) @@ -623,7 +623,7 @@ class TestRandomAffine: h, w = image.spatial_size transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) - params = transform._get_params(image) + params = 
transform._get_params([image]) if not isinstance(degrees, (list, tuple)): assert -degrees <= params["angle"] <= degrees @@ -690,7 +690,7 @@ class TestRandomAffine: torch.manual_seed(12) _ = transform(inpt) torch.manual_seed(12) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center) @@ -722,7 +722,7 @@ class TestRandomCrop: h, w = image.spatial_size transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) - params = transform._get_params(image) + params = transform._get_params([image]) if padding is not None: if isinstance(padding, int): @@ -793,7 +793,7 @@ class TestRandomCrop: torch.manual_seed(12) _ = transform(inpt) torch.manual_seed(12) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) if padding is None and not pad_if_needed: fn_crop.assert_called_once_with( inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] @@ -832,7 +832,7 @@ class TestGaussianBlur: @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]]) def test__get_params(self, sigma): transform = transforms.GaussianBlur(3, sigma=sigma) - params = transform._get_params(None) + params = transform._get_params([]) if isinstance(sigma, float): assert params["sigma"][0] == params["sigma"][1] == 10 @@ -867,7 +867,7 @@ class TestGaussianBlur: torch.manual_seed(12) _ = transform(inpt) torch.manual_seed(12) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fn.assert_called_once_with(inpt, kernel_size, **params) @@ -912,7 +912,7 @@ class TestRandomPerspective: image.num_channels = 3 image.spatial_size = (24, 32) - params = transform._get_params(image) + params = transform._get_params([image]) h, w = image.spatial_size assert "perspective_coeffs" in params @@ -935,7 +935,7 @@ class TestRandomPerspective: _ = transform(inpt) torch.manual_seed(12) torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) @@ -973,7 +973,7 @@ class TestElasticTransform: image.num_channels = 3 image.spatial_size = (24, 32) - params = transform._get_params(image) + params = transform._get_params([image]) h, w = image.spatial_size displacement = params["displacement"] @@ -1006,7 +1006,7 @@ class TestElasticTransform: # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() _ = transform(inpt) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) @@ -1035,7 +1035,7 @@ class TestRandomErasing: transform = transforms.RandomErasing(value=[1, 2, 3, 4]) with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): - transform._get_params(image) + transform._get_params([image]) @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) def test__get_params(self, value, mocker): @@ -1044,7 +1044,7 @@ class TestRandomErasing: image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=value) - params = transform._get_params(image) + params = transform._get_params([image]) v = 
params["v"] h, w = params["h"], params["w"] @@ -1197,6 +1197,7 @@ class TestContainers: [ [transforms.Pad(2), transforms.RandomCrop(28)], [lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)], + [transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)], ], ) def test_ctor(self, transform_cls, trfms): @@ -1339,7 +1340,7 @@ class TestScaleJitter: n_samples = 5 for _ in range(n_samples): - params = transform._get_params(sample) + params = transform._get_params([sample]) assert "size" in params size = params["size"] @@ -1386,7 +1387,7 @@ class TestRandomShortestSize: transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size) - params = transform._get_params(sample) + params = transform._get_params([sample]) assert "size" in params size = params["size"] @@ -1554,13 +1555,13 @@ class TestFixedSizeCrop: transform = transforms.FixedSizeCrop(size=crop_size) - sample = dict( - image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB), - bounding_boxes=make_bounding_box( + flat_inputs = [ + make_image(size=spatial_size, color_space=features.ColorSpace.RGB), + make_bounding_box( format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape ), - ) - params = transform._get_params(sample) + ] + params = transform._get_params(flat_inputs) assert params["needs_crop"] assert params["height"] <= crop_size[0] @@ -1759,7 +1760,7 @@ class TestRandomResize: transform = transforms.RandomResize(min_size=min_size, max_size=max_size) for _ in range(10): - params = transform._get_params(None) + params = transform._get_params([]) assert isinstance(params["size"], list) and len(params["size"]) == 1 size = params["size"][0] diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 589d45595..7f439fb26 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -639,7 +639,7 @@ class TestContainerTransforms: prototype_transform = prototype_transforms.RandomApply( [ prototype_transforms.Resize(256), - legacy_transforms.CenterCrop(224), + prototype_transforms.CenterCrop(224), ], p=p, ) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 5861dd291..99b77eb40 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -45,8 +45,8 @@ class RandomErasing(_RandomApplyTransform): self._log_ratio = torch.log(torch.tensor(self.ratio)) - def _get_params(self, sample: Any) -> Dict[str, Any]: - img_c, img_h, img_w = query_chw(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + img_c, img_h, img_w = query_chw(flat_inputs) if isinstance(self.value, (int, float)): value = [self.value] @@ -107,13 +107,13 @@ class _BaseMixupCutmix(_RandomApplyTransform): self.alpha = alpha self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _check_inputs(self, sample: Any) -> None: + def _check_inputs(self, flat_inputs: List[Any]) -> None: if not ( - has_any(sample, features.Image, features.Video, features.is_simple_tensor) - and has_any(sample, features.OneHotLabel) + has_any(flat_inputs, features.Image, features.Video, features.is_simple_tensor) + and has_any(flat_inputs, features.OneHotLabel) ): raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") - if 
has_any(sample, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): + if has_any(flat_inputs, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." ) @@ -127,7 +127,7 @@ class _BaseMixupCutmix(_RandomApplyTransform): class RandomMixup(_BaseMixupCutmix): - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: return dict(lam=float(self._dist.sample(()))) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -150,10 +150,10 @@ class RandomMixup(_BaseMixupCutmix): class RandomCutmix(_BaseMixupCutmix): - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: lam = float(self._dist.sample(())) - H, W = query_spatial_size(sample) + H, W = query_spatial_size(flat_inputs) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) @@ -344,9 +344,9 @@ class SimpleCopyPaste(_RandomApplyTransform): c3 += 1 def forward(self, *inputs: Any) -> Any: - flat_sample, spec = tree_flatten(inputs) + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) - images, targets = self._extract_image_targets(flat_sample) + images, targets = self._extract_image_targets(flat_inputs) # images = [t1, t2, ..., tN] # Let's define paste_images as shifted list of input images @@ -384,6 +384,6 @@ class SimpleCopyPaste(_RandomApplyTransform): output_targets.append(output_target) # Insert updated images and targets into input flat_sample - self._insert_outputs(flat_sample, output_images, output_targets) + self._insert_outputs(flat_inputs, output_images, output_targets) - return tree_unflatten(flat_sample, spec) + return tree_unflatten(flat_inputs, spec) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 02c1a18da..47fc15422 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -4,7 +4,7 @@ from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, TypeV import PIL.Image import torch -from torch.utils._pytree import tree_flatten, tree_unflatten +from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec from torchvision.prototype import features from torchvision.prototype.transforms import AutoAugmentPolicy, functional as F, InterpolationMode, Transform from torchvision.prototype.transforms.functional._meta import get_spatial_size @@ -31,16 +31,17 @@ class _AutoAugmentBase(Transform): key = keys[int(torch.randint(len(keys), ()))] return key, dct[key] - def _extract_image_or_video( + def _flatten_and_extract_image_or_video( self, - sample: Any, + inputs: Any, unsupported_types: Tuple[Type, ...] 
= (features.BoundingBox, features.Mask), - ) -> Tuple[int, Union[features.ImageType, features.VideoType]]: - sample_flat, _ = tree_flatten(sample) + ) -> Tuple[Tuple[List[Any], TreeSpec, int], Union[features.ImageType, features.VideoType]]: + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) + image_or_videos = [] - for id, inpt in enumerate(sample_flat): + for idx, inpt in enumerate(flat_inputs): if _isinstance(inpt, (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video)): - image_or_videos.append((id, inpt)) + image_or_videos.append((idx, inpt)) elif isinstance(inpt, unsupported_types): raise TypeError(f"Inputs of type {type(inpt).__name__} are not supported by {type(self).__name__}()") @@ -51,12 +52,18 @@ class _AutoAugmentBase(Transform): f"Auto augment transformations are only properly defined for a single image or video, " f"but found {len(image_or_videos)}." ) - return image_or_videos[0] - def _put_into_sample(self, sample: Any, id: int, item: Any) -> Any: - sample_flat, spec = tree_flatten(sample) - sample_flat[id] = item - return tree_unflatten(sample_flat, spec) + idx, image_or_video = image_or_videos[0] + return (flat_inputs, spec, idx), image_or_video + + def _unflatten_and_insert_image_or_video( + self, + flat_inputs_with_spec: Tuple[List[Any], TreeSpec, int], + image_or_video: Union[features.ImageType, features.VideoType], + ) -> Any: + flat_inputs, spec, idx = flat_inputs_with_spec + flat_inputs[idx] = image_or_video + return tree_unflatten(flat_inputs, spec) def _apply_image_or_video_transform( self, @@ -275,9 +282,7 @@ class AutoAugment(_AutoAugmentBase): raise ValueError(f"The provided policy {policy} is not recognized.") def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - - id, image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = get_spatial_size(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -300,7 +305,7 @@ class AutoAugment(_AutoAugmentBase): image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image_or_video) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video) class RandAugment(_AutoAugmentBase): @@ -346,9 +351,7 @@ class RandAugment(_AutoAugmentBase): self.num_magnitude_bins = num_magnitude_bins def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - - id, image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = get_spatial_size(image_or_video) for _ in range(self.num_ops): @@ -364,7 +367,7 @@ class RandAugment(_AutoAugmentBase): image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image_or_video) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video) class TrivialAugmentWide(_AutoAugmentBase): @@ -400,9 +403,7 @@ class TrivialAugmentWide(_AutoAugmentBase): self.num_magnitude_bins = num_magnitude_bins def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - - id, image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = 
get_spatial_size(image_or_video) transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -418,7 +419,7 @@ class TrivialAugmentWide(_AutoAugmentBase): image_or_video = self._apply_image_or_video_transform( image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image_or_video) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video) class AugMix(_AutoAugmentBase): @@ -471,8 +472,7 @@ class AugMix(_AutoAugmentBase): return torch._sample_dirichlet(params) def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - id, orig_image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, orig_image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = get_spatial_size(orig_image_or_video) if isinstance(orig_image_or_video, torch.Tensor): @@ -525,4 +525,4 @@ class AugMix(_AutoAugmentBase): elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) - return self._put_into_sample(sample, id, mix) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 609f03bf4..3647365c3 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -1,5 +1,5 @@ import collections.abc -from typing import Any, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import PIL.Image import torch @@ -53,7 +53,7 @@ class ColorJitter(Transform): def _generate_value(left: float, right: float) -> float: return float(torch.distributions.Uniform(left, right).sample()) - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: fn_idx = torch.randperm(4) b = None if self.brightness is None else self._generate_value(self.brightness[0], self.brightness[1]) @@ -99,8 +99,8 @@ class RandomPhotometricDistort(Transform): self.saturation = saturation self.p = p - def _get_params(self, sample: Any) -> Dict[str, Any]: - num_channels, *_ = query_chw(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + num_channels, *_ = query_chw(flat_inputs) return dict( zip( ["brightness", "contrast1", "saturation", "hue", "contrast2"], diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index e401534f4..ac61f4f77 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Dict, Union +from typing import Any, Dict, List, Union import numpy as np import PIL.Image @@ -79,8 +79,8 @@ class RandomGrayscale(_RandomApplyTransform): super().__init__(p=p) - def _get_params(self, sample: Any) -> Dict[str, Any]: - num_input_channels, *_ = query_chw(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + num_input_channels, *_ = query_chw(flat_inputs) return dict(num_input_channels=num_input_channels) def _transform( diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 5b31adc9e..4987256ce 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -104,8 +104,8 @@ class RandomResizedCrop(Transform): self._log_ratio 
= torch.log(torch.tensor(self.ratio)) - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) area = height * width log_ratio = self._log_ratio @@ -184,8 +184,8 @@ class FiveCrop(Transform): ) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: return F.five_crop(inpt, self.size) - def _check_inputs(self, sample: Any) -> None: - if has_any(sample, features.BoundingBox, features.Mask): + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if has_any(flat_inputs, features.BoundingBox, features.Mask): raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") @@ -201,8 +201,8 @@ class TenCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip - def _check_inputs(self, sample: Any) -> None: - if has_any(sample, features.BoundingBox, features.Mask): + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if has_any(flat_inputs, features.BoundingBox, features.Mask): raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") def _transform( @@ -256,8 +256,8 @@ class RandomZoomOut(_RandomApplyTransform): if side_range[0] < 1.0 or side_range[0] > side_range[1]: raise ValueError(f"Invalid canvas side range provided {side_range}.") - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_h, orig_w = query_spatial_size(flat_inputs) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -299,7 +299,7 @@ class RandomRotation(Transform): self.center = center - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) return dict(angle=angle) @@ -355,8 +355,8 @@ class RandomAffine(Transform): self.center = center - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) if self.translate is not None: @@ -417,8 +417,8 @@ class RandomCrop(Transform): self.fill = _setup_fill_arg(fill) self.padding_mode = padding_mode - def _get_params(self, sample: Any) -> Dict[str, Any]: - padded_height, padded_width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + padded_height, padded_width = query_spatial_size(flat_inputs) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -505,8 +505,8 @@ class RandomPerspective(_RandomApplyTransform): self.interpolation = interpolation self.fill = _setup_fill_arg(fill) - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) distortion_scale = self.distortion_scale @@ -559,8 +559,8 @@ class ElasticTransform(Transform): self.interpolation = interpolation self.fill = 
_setup_fill_arg(fill) - def _get_params(self, sample: Any) -> Dict[str, Any]: - size = list(query_spatial_size(sample)) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + size = list(query_spatial_size(flat_inputs)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -614,20 +614,20 @@ class RandomIoUCrop(Transform): self.options = sampler_options self.trials = trials - def _check_inputs(self, sample: Any) -> None: + def _check_inputs(self, flat_inputs: List[Any]) -> None: if not ( - has_all(sample, features.BoundingBox) - and has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor) - and has_any(sample, features.Label, features.OneHotLabel) + has_all(flat_inputs, features.BoundingBox) + and has_any(flat_inputs, PIL.Image.Image, features.Image, features.is_simple_tensor) + and has_any(flat_inputs, features.Label, features.OneHotLabel) ): raise TypeError( f"{type(self).__name__}() requires input sample to contain Images or PIL Images, " "BoundingBoxes and Labels or OneHotLabels. Sample can also contain Masks." ) - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(sample) - bboxes = query_bounding_box(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_h, orig_w = query_spatial_size(flat_inputs) + bboxes = query_bounding_box(flat_inputs) while True: # sample an option @@ -712,8 +712,8 @@ class ScaleJitter(Transform): self.interpolation = interpolation self.antialias = antialias - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_height, orig_width = query_spatial_size(flat_inputs) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -740,8 +740,8 @@ class RandomShortestSize(Transform): self.interpolation = interpolation self.antialias = antialias - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_height, orig_width = query_spatial_size(flat_inputs) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) @@ -771,20 +771,22 @@ class FixedSizeCrop(Transform): self.padding_mode = padding_mode - def _check_inputs(self, sample: Any) -> None: - if not has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if not has_any(flat_inputs, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): raise TypeError( f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." ) - if has_any(sample, features.BoundingBox) and not has_any(sample, features.Label, features.OneHotLabel): + if has_any(flat_inputs, features.BoundingBox) and not has_any( + flat_inputs, features.Label, features.OneHotLabel + ): raise TypeError( f"If a BoundingBox is contained in the input sample, " f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." 
) - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -798,7 +800,7 @@ class FixedSizeCrop(Transform): left = int(offset_width * r) try: - bounding_boxes = query_bounding_box(sample) + bounding_boxes = query_bounding_box(flat_inputs) except ValueError: bounding_boxes = None @@ -874,7 +876,7 @@ class RandomResize(Transform): self.interpolation = interpolation self.antialias = antialias - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: size = int(torch.randint(self.min_size, self.max_size, ())) return dict(size=[size]) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 61be60cee..e26656339 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -1,6 +1,6 @@ import functools from collections import defaultdict -from typing import Any, Callable, Dict, Sequence, Type, Union +from typing import Any, Callable, Dict, List, Sequence, Type, Union import PIL.Image @@ -134,7 +134,7 @@ class GaussianBlur(Transform): self.sigma = _setup_float_or_seq(sigma, "sigma", 2) - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: sigma = torch.empty(1).uniform_(self.sigma[0], self.sigma[1]).item() return dict(sigma=[sigma, sigma]) @@ -167,8 +167,8 @@ class RemoveSmallBoundingBoxes(Transform): super().__init__() self.min_size = min_size - def _get_params(self, sample: Any) -> Dict[str, Any]: - bounding_box = query_bounding_box(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + bounding_box = query_bounding_box(flat_inputs) # TODO: We can improve performance here by not using the `remove_small_boxes` function. It requires the box to # be in XYXY format only to calculate the width and height internally. 
Thus, if the box is in XYWH or CXCYWH diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 056c2da9f..523fa18fa 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -1,5 +1,5 @@ import enum -from typing import Any, Callable, Dict, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Tuple, Type, Union import PIL.Image import torch @@ -23,27 +23,27 @@ class Transform(nn.Module): super().__init__() _log_api_usage_once(self) - def _check_inputs(self, sample: Any) -> None: + def _check_inputs(self, flat_inputs: List[Any]) -> None: pass - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: return dict() def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: raise NotImplementedError def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) - self._check_inputs(sample) + self._check_inputs(flat_inputs) - params = self._get_params(sample) + params = self._get_params(flat_inputs) - flat_inputs, spec = tree_flatten(sample) flat_outputs = [ self._transform(inpt, params) if _isinstance(inpt, self._transformed_types) else inpt for inpt in flat_inputs ] + return tree_unflatten(flat_outputs, spec) def extra_repr(self) -> str: @@ -73,18 +73,19 @@ class _RandomApplyTransform(Transform): # early afterwards in case the random check triggers. The same result could be achieved by calling # `super().forward()` after the random check, but that would call `self._check_inputs` twice. - sample = inputs if len(inputs) > 1 else inputs[0] + inputs = inputs if len(inputs) > 1 else inputs[0] + flat_inputs, spec = tree_flatten(inputs) - self._check_inputs(sample) + self._check_inputs(flat_inputs) if torch.rand(1) >= self.p: - return sample + return inputs - params = self._get_params(sample) + params = self._get_params(flat_inputs) - flat_inputs, spec = tree_flatten(sample) flat_outputs = [ self._transform(inpt, params) if _isinstance(inpt, self._transformed_types) else inpt for inpt in flat_inputs ] + return tree_unflatten(flat_outputs, spec) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index 53b27f2e2..b3e241d16 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -1,11 +1,10 @@ import functools import numbers from collections import defaultdict -from typing import Any, Callable, Dict, Sequence, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Sequence, Tuple, Type, Union import PIL.Image -from torch.utils._pytree import tree_flatten from torchvision._utils import sequence_to_str from torchvision.prototype import features from torchvision.prototype.features._feature import FillType @@ -73,9 +72,8 @@ def _check_padding_mode_arg(padding_mode: Literal["constant", "edge", "reflect", raise ValueError("Padding mode should be either constant, edge, reflect or symmetric") -def query_bounding_box(sample: Any) -> features.BoundingBox: - flat_sample, _ = tree_flatten(sample) - bounding_boxes = {item for item in flat_sample if isinstance(item, features.BoundingBox)} +def query_bounding_box(flat_inputs: List[Any]) -> features.BoundingBox: + bounding_boxes = {inpt for inpt in flat_inputs if isinstance(inpt, features.BoundingBox)} if not bounding_boxes: raise TypeError("No bounding box 
was found in the sample") elif len(bounding_boxes) > 1: @@ -83,12 +81,11 @@ def query_bounding_box(sample: Any) -> features.BoundingBox: return bounding_boxes.pop() -def query_chw(sample: Any) -> Tuple[int, int, int]: - flat_sample, _ = tree_flatten(sample) +def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: chws = { - tuple(get_dimensions(item)) - for item in flat_sample - if isinstance(item, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(item) + tuple(get_dimensions(inpt)) + for inpt in flat_inputs + if isinstance(inpt, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(inpt) } if not chws: raise TypeError("No image or video was found in the sample") @@ -98,13 +95,12 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: return c, h, w -def query_spatial_size(sample: Any) -> Tuple[int, int]: - flat_sample, _ = tree_flatten(sample) +def query_spatial_size(flat_inputs: List[Any]) -> Tuple[int, int]: sizes = { - tuple(get_spatial_size(item)) - for item in flat_sample - if isinstance(item, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) - or features.is_simple_tensor(item) + tuple(get_spatial_size(inpt)) + for inpt in flat_inputs + if isinstance(inpt, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) + or features.is_simple_tensor(inpt) } if not sizes: raise TypeError("No image, video, mask or bounding box was found in the sample") @@ -121,19 +117,17 @@ def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], boo return False -def has_any(sample: Any, *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: - flat_sample, _ = tree_flatten(sample) - for obj in flat_sample: - if _isinstance(obj, types_or_checks): +def has_any(flat_inputs: List[Any], *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: + for inpt in flat_inputs: + if _isinstance(inpt, types_or_checks): return True return False -def has_all(sample: Any, *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: - flat_sample, _ = tree_flatten(sample) +def has_all(flat_inputs: List[Any], *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: for type_or_check in types_or_checks: - for obj in flat_sample: - if isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj): + for inpt in flat_inputs: + if isinstance(inpt, type_or_check) if isinstance(type_or_check, type) else type_or_check(inpt): break else: return False -- GitLab From 88b6b93d2b7f89b30f427ec715bff8dd1756535e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 12:54:56 +0100 Subject: [PATCH 046/624] Extend `RandomShortestSize` to support Video specific flavour of the augmentation (#6770) * Extend RandomShortestSize to support Video specific flavour of the augmentation * Adding a test. 
* Apply changes from code review --- test/test_prototype_transforms.py | 10 ++++------ torchvision/prototype/transforms/_geometry.py | 6 ++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 11a51f7b5..5928e6718 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1379,10 +1379,9 @@ class TestScaleJitter: class TestRandomShortestSize: - def test__get_params(self, mocker): + @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) + def test__get_params(self, min_size, max_size, mocker): spatial_size = (3, 10) - min_size = [5, 9] - max_size = 20 transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) @@ -1395,10 +1394,9 @@ class TestRandomShortestSize: assert isinstance(size, tuple) and len(size) == 2 longer = max(size) - assert longer <= max_size - shorter = min(size) - if longer == max_size: + if max_size is not None: + assert longer <= max_size assert shorter <= max_size else: assert shorter in min_size diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 4987256ce..5c67bf0ec 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -730,7 +730,7 @@ class RandomShortestSize(Transform): def __init__( self, min_size: Union[List[int], Tuple[int], int], - max_size: int, + max_size: Optional[int] = None, interpolation: InterpolationMode = InterpolationMode.BILINEAR, antialias: Optional[bool] = None, ): @@ -744,7 +744,9 @@ class RandomShortestSize(Transform): orig_height, orig_width = query_spatial_size(flat_inputs) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] - r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) + r = min_size / min(orig_height, orig_width) + if self.max_size is not None: + r = min(r, self.max_size / max(orig_height, orig_width)) new_width = int(orig_width * r) new_height = int(orig_height * r) -- GitLab From c960273c131e41a06a7b47836fb5ee81c88ebc5d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 13:34:29 +0100 Subject: [PATCH 047/624] Switch `view()` with `reshape()` on equalize (#6772) --- torchvision/prototype/transforms/functional/_color.py | 2 +- torchvision/transforms/functional_tensor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 7cbf8885c..8460f9c64 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -227,7 +227,7 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image - return _equalize_image_tensor_vec(image.view(-1, height, width)).view(image.shape) + return _equalize_image_tensor_vec(image.view(-1, height, width)).reshape(image.shape) equalize_image_pil = _FP.equalize diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index 20b76fbf0..4944c75fa 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -875,7 +875,7 @@ def _scale_channel(img_chan: Tensor) -> Tensor: if img_chan.is_cuda: hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255) else: - hist = torch.bincount(img_chan.view(-1), minlength=256) + hist 
= torch.bincount(img_chan.reshape(-1), minlength=256) nonzero_hist = hist[hist != 0] step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor") -- GitLab From 8ec7a70f29010945cf640645d2cd16cb79bf3d9e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 14 Oct 2022 16:05:12 +0200 Subject: [PATCH 048/624] allow tolerances in transforms consistency checks (#6774) --- test/test_prototype_transforms_consistency.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 7f439fb26..7d2f1d735 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -12,6 +12,7 @@ import pytest import torch from prototype_common_utils import ( ArgsKwargs, + assert_close, assert_equal, make_bounding_box, make_detection_mask, @@ -40,6 +41,7 @@ class ConsistencyConfig: make_images_kwargs=None, supports_pil=True, removed_params=(), + closeness_kwargs=None, ): self.prototype_cls = prototype_cls self.legacy_cls = legacy_cls @@ -47,6 +49,7 @@ class ConsistencyConfig: self.make_images_kwargs = make_images_kwargs or DEFAULT_MAKE_IMAGES_KWARGS self.supports_pil = supports_pil self.removed_params = removed_params + self.closeness_kwargs = closeness_kwargs or dict(rtol=0, atol=0) # These are here since both the prototype and legacy transform need to be constructed with the same random parameters @@ -491,10 +494,14 @@ def test_signature_consistency(config): assert prototype_kinds == legacy_kinds -def check_call_consistency(prototype_transform, legacy_transform, images=None, supports_pil=True): +def check_call_consistency( + prototype_transform, legacy_transform, images=None, supports_pil=True, closeness_kwargs=None +): if images is None: images = make_images(**DEFAULT_MAKE_IMAGES_KWARGS) + closeness_kwargs = closeness_kwargs or dict() + for image in images: image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" @@ -520,10 +527,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s f"`is_simple_tensor` path in `_transform`." ) from exc - assert_equal( + assert_close( output_prototype_tensor, output_legacy_tensor, msg=lambda msg: f"Tensor image consistency check failed with: \n\n{msg}", + **closeness_kwargs, ) try: @@ -536,10 +544,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s f"`features.Image` path in `_transform`." ) from exc - assert_equal( + assert_close( output_prototype_image, output_prototype_tensor, msg=lambda msg: f"Output for feature and tensor images is not equal: \n\n{msg}", + **closeness_kwargs, ) if image.ndim == 3 and supports_pil: @@ -565,10 +574,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s f"`PIL.Image.Image` path in `_transform`." 
) from exc - assert_equal( + assert_close( output_prototype_pil, output_legacy_pil, msg=lambda msg: f"PIL image consistency check failed with: \n\n{msg}", + **closeness_kwargs, ) @@ -606,6 +616,7 @@ def test_call_consistency(config, args_kwargs): legacy_transform, images=make_images(**config.make_images_kwargs), supports_pil=config.supports_pil, + closeness_kwargs=config.closeness_kwargs, ) -- GitLab From e1aacdd9f0712ec971e689dc23e7a3204597179d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 15:52:08 +0100 Subject: [PATCH 049/624] Update `ToDtype` to avoid unnecessary `to()` calls and fixing types on `Transform` (#6773) * Fix `ToDtype` to avoid errors when a type is not defined. * Nit `(features.is_simple_tensor, features._Feature)` to `(Tensor,)` * Fixing linter * Adding comment. * Switch back to indexing. Python's default dict seems to have a nasty behaviour. --- torchvision/prototype/transforms/_misc.py | 5 ++++- torchvision/prototype/transforms/_transform.py | 8 ++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index e26656339..b31c688dc 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -157,7 +157,10 @@ class ToDtype(Transform): self.dtype = dtype def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - return inpt.to(self.dtype[type(inpt)]) + dtype = self.dtype[type(inpt)] + if dtype is None: + return inpt + return inpt.to(dtype=dtype) class RemoveSmallBoundingBoxes(Transform): diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 523fa18fa..95cf9c011 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -5,7 +5,6 @@ import PIL.Image import torch from torch import nn from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision.prototype import features from torchvision.prototype.transforms._utils import _isinstance from torchvision.utils import _log_api_usage_once @@ -13,11 +12,8 @@ from torchvision.utils import _log_api_usage_once class Transform(nn.Module): # Class attribute defining transformed types. Other types are passed-through without any transformation - _transformed_types: Tuple[Union[Type, Callable[[Any], bool]], ...] = ( - features.is_simple_tensor, - features._Feature, - PIL.Image.Image, - ) + # We support both Types and callables that are able to do further checks on the type of the input. + _transformed_types: Tuple[Union[Type, Callable[[Any], bool]], ...] 
= (torch.Tensor, PIL.Image.Image) def __init__(self) -> None: super().__init__() -- GitLab From e2fa1f9ddfe63237c226ef478baf1e35ff8d7e7e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 17:37:58 +0100 Subject: [PATCH 050/624] Reshare input before equalize (#6775) --- torchvision/prototype/transforms/functional/_color.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 8460f9c64..379736b00 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -227,7 +227,7 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image - return _equalize_image_tensor_vec(image.view(-1, height, width)).reshape(image.shape) + return _equalize_image_tensor_vec(image.reshape(-1, height, width)).reshape(image.shape) equalize_image_pil = _FP.equalize -- GitLab From f467349ce0d41c23695538add22f6fec5a30ece4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Sat, 15 Oct 2022 08:16:21 +0200 Subject: [PATCH 051/624] replace .view with .reshape (#6777) --- .../prototype/transforms/_auto_augment.py | 10 +-- torchvision/prototype/transforms/_misc.py | 4 +- .../prototype/transforms/functional/_color.py | 6 +- .../transforms/functional/_geometry.py | 66 +++++++++---------- .../prototype/transforms/functional/_misc.py | 4 +- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 47fc15422..56d581eff 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -484,7 +484,7 @@ class AugMix(_AutoAugmentBase): orig_dims = list(image_or_video.shape) expected_ndim = 5 if isinstance(orig_image_or_video, features.Video) else 4 - batch = image_or_video.view([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) + batch = image_or_video.reshape([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) # Sample the beta weights for combining the original and augmented image or video. To get Beta, we use a @@ -497,9 +497,9 @@ class AugMix(_AutoAugmentBase): # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images or videos. 
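        # (Illustrative note: `m[:, 0]` weights the original batch and `m[:, 1]` weights the augmented
        # branch, so each sample ends up as m0 * original + m1 * sum_i(w_i * chain_i) with Dirichlet
        # weights w_i -- a convex combination, since m0 + m1 == 1 and the w_i sum to 1.)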
combined_weights = self._sample_dirichlet( torch.tensor([self.alpha] * self.mixture_width, device=batch.device).expand(batch_dims[0], -1) - ) * m[:, 1].view([batch_dims[0], -1]) + ) * m[:, 1].reshape([batch_dims[0], -1]) - mix = m[:, 0].view(batch_dims) * batch + mix = m[:, 0].reshape(batch_dims) * batch for i in range(self.mixture_width): aug = batch depth = self.chain_depth if self.chain_depth > 0 else int(torch.randint(low=1, high=4, size=(1,)).item()) @@ -517,8 +517,8 @@ class AugMix(_AutoAugmentBase): aug = self._apply_image_or_video_transform( aug, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - mix.add_(combined_weights[:, i].view(batch_dims) * aug) - mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) + mix.add_(combined_weights[:, i].reshape(batch_dims) * aug) + mix = mix.reshape(orig_dims).to(dtype=image_or_video.dtype) if isinstance(orig_image_or_video, (features.Image, features.Video)): mix = orig_image_or_video.wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index b31c688dc..bf7af5c26 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -88,9 +88,9 @@ class LinearTransformation(Transform): f"Got {inpt.device} vs {self.mean_vector.device}" ) - flat_tensor = inpt.view(-1, n) - self.mean_vector + flat_tensor = inpt.reshape(-1, n) - self.mean_vector transformed_tensor = torch.mm(flat_tensor, self.transformation_matrix) - return transformed_tensor.view(shape) + return transformed_tensor.reshape(shape) class Normalize(Transform): diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 379736b00..49a769e04 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -69,7 +69,7 @@ def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float) shape = image.shape if image.ndim > 4: - image = image.view(-1, num_channels, height, width) + image = image.reshape(-1, num_channels, height, width) needs_unsquash = True else: needs_unsquash = False @@ -77,7 +77,7 @@ def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float) output = _FT._blend(image, _FT._blurred_degenerate_image(image), sharpness_factor) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output @@ -213,7 +213,7 @@ def _equalize_image_tensor_vec(img: torch.Tensor) -> torch.Tensor: zeros = lut.new_zeros((1, 1)).expand(shape[0], 1) lut = torch.cat([zeros, lut[:, :-1]], dim=1) - return torch.where((step == 0).unsqueeze(-1), img, lut.gather(dim=1, index=flat_img).view_as(img)) + return torch.where((step == 0).unsqueeze(-1), img, lut.gather(dim=1, index=flat_img).reshape_as(img)) def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 43962ad4d..1c897700c 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -38,13 +38,13 @@ def horizontal_flip_bounding_box( bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]] return 
convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(shape) + ).reshape(shape) def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor: @@ -75,13 +75,13 @@ def vertical_flip_bounding_box( bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(shape) + ).reshape(shape) def vertical_flip_video(video: torch.Tensor) -> torch.Tensor: @@ -123,7 +123,7 @@ def resize_image_tensor( extra_dims = image.shape[:-3] if image.numel() > 0: - image = image.view(-1, num_channels, old_height, old_width) + image = image.reshape(-1, num_channels, old_height, old_width) image = _FT.resize( image, @@ -132,7 +132,7 @@ def resize_image_tensor( antialias=antialias, ) - return image.view(extra_dims + (num_channels, new_height, new_width)) + return image.reshape(extra_dims + (num_channels, new_height, new_width)) @torch.jit.unused @@ -168,7 +168,7 @@ def resize_bounding_box( new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) ratios = torch.tensor((new_width / old_width, new_height / old_height), device=bounding_box.device) return ( - bounding_box.view(-1, 2, 2).mul(ratios).to(bounding_box.dtype).view(bounding_box.shape), + bounding_box.reshape(-1, 2, 2).mul(ratios).to(bounding_box.dtype).reshape(bounding_box.shape), (new_height, new_width), ) @@ -270,7 +270,7 @@ def affine_image_tensor( num_channels, height, width = image.shape[-3:] extra_dims = image.shape[:-3] - image = image.view(-1, num_channels, height, width) + image = image.reshape(-1, num_channels, height, width) angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center) @@ -283,7 +283,7 @@ def affine_image_tensor( matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear) output = _FT.affine(image, matrix, interpolation=interpolation.value, fill=fill) - return output.view(extra_dims + (num_channels, height, width)) + return output.reshape(extra_dims + (num_channels, height, width)) @torch.jit.unused @@ -338,20 +338,20 @@ def _affine_bounding_box_xyxy( dtype=dtype, device=device, ) - .view(2, 3) + .reshape(2, 3) .T ) # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-left, bottom-right corners). 
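    # (Illustration: a single XYXY box [x1, y1, x2, y2] indexed with [[0, 1], [2, 1], [2, 3], [0, 3]]
    # yields the corners (x1, y1), (x2, y1), (x2, y2), (x1, y2), so N boxes flatten to (N * 4, 2)
    # before the homogeneous 1s are appended.)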
# Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] - points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using affine matrix transformed_points = torch.matmul(points, transposed_affine_matrix) # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] # and compute bounding box from 4 transformed points: - transformed_points = transformed_points.view(-1, 4, 2) + transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, _ = torch.min(transformed_points, dim=1) out_bbox_maxs, _ = torch.max(transformed_points, dim=1) out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1) @@ -396,7 +396,7 @@ def affine_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, spatial_size, angle, translate, scale, shear, center) @@ -404,7 +404,7 @@ def affine_bounding_box( return convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape) + ).reshape(original_shape) def affine_mask( @@ -539,7 +539,7 @@ def rotate_image_tensor( if image.numel() > 0: image = _FT.rotate( - image.view(-1, num_channels, height, width), + image.reshape(-1, num_channels, height, width), matrix, interpolation=interpolation.value, expand=expand, @@ -549,7 +549,7 @@ def rotate_image_tensor( else: new_width, new_height = _FT._compute_affine_output_size(matrix, width, height) if expand else (width, height) - return image.view(extra_dims + (num_channels, new_height, new_width)) + return image.reshape(extra_dims + (num_channels, new_height, new_width)) @torch.jit.unused @@ -585,7 +585,7 @@ def rotate_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) out_bboxes, spatial_size = _affine_bounding_box_xyxy( bounding_box, @@ -601,7 +601,7 @@ def rotate_bounding_box( return ( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape), + ).reshape(original_shape), spatial_size, ) @@ -691,7 +691,7 @@ def _pad_with_scalar_fill( if image.numel() > 0: image = _FT.pad( - img=image.view(-1, num_channels, height, width), padding=padding, fill=fill, padding_mode=padding_mode + img=image.reshape(-1, num_channels, height, width), padding=padding, fill=fill, padding_mode=padding_mode ) new_height, new_width = image.shape[-2:] else: @@ -699,7 +699,7 @@ def _pad_with_scalar_fill( new_height = height + top + bottom new_width = width + left + right - return image.view(extra_dims + (num_channels, new_height, new_width)) + return image.reshape(extra_dims + (num_channels, new_height, new_width)) # TODO: This should be removed once pytorch pad supports non-scalar padding values @@ -714,7 +714,7 @@ def _pad_with_vector_fill( output = _pad_with_scalar_fill(image, padding, fill=0, padding_mode="constant") left, right, top, bottom = _parse_pad_padding(padding) - fill = 
torch.tensor(fill, dtype=image.dtype, device=image.device).view(-1, 1, 1) + fill = torch.tensor(fill, dtype=image.dtype, device=image.device).reshape(-1, 1, 1) if top > 0: output[..., :top, :] = fill @@ -863,7 +863,7 @@ def perspective_image_tensor( shape = image.shape if image.ndim > 4: - image = image.view((-1,) + shape[-3:]) + image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True else: needs_unsquash = False @@ -871,7 +871,7 @@ def perspective_image_tensor( output = _FT.perspective(image, perspective_coeffs, interpolation=interpolation.value, fill=fill) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output @@ -898,7 +898,7 @@ def perspective_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 device = bounding_box.device @@ -947,7 +947,7 @@ def perspective_bounding_box( # Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] - points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using perspective matrices # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1) @@ -959,7 +959,7 @@ def perspective_bounding_box( # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] # and compute bounding box from 4 transformed points: - transformed_points = transformed_points.view(-1, 4, 2) + transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, _ = torch.min(transformed_points, dim=1) out_bbox_maxs, _ = torch.max(transformed_points, dim=1) out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype) @@ -968,7 +968,7 @@ def perspective_bounding_box( return convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape) + ).reshape(original_shape) def perspective_mask( @@ -1027,7 +1027,7 @@ def elastic_image_tensor( shape = image.shape if image.ndim > 4: - image = image.view((-1,) + shape[-3:]) + image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True else: needs_unsquash = False @@ -1035,7 +1035,7 @@ def elastic_image_tensor( output = _FT.elastic_transform(image, displacement, interpolation=interpolation.value, fill=fill) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output @@ -1063,7 +1063,7 @@ def elastic_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it # Or add spatial_size arg and check displacement shape @@ -1075,21 +1075,21 @@ def elastic_bounding_box( inv_grid = id_grid - displacement # Get points from bboxes - points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) index_x = torch.floor(points[:, 0] + 
0.5).to(dtype=torch.long) index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long) # Transform points: t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5 - transformed_points = transformed_points.view(-1, 4, 2) + transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, _ = torch.min(transformed_points, dim=1) out_bbox_maxs, _ = torch.max(transformed_points, dim=1) out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype) return convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape) + ).reshape(original_shape) def elastic_mask( diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 8fda24e17..5b2dd135a 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -65,7 +65,7 @@ def gaussian_blur_image_tensor( shape = image.shape if image.ndim > 4: - image = image.view((-1,) + shape[-3:]) + image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True else: needs_unsquash = False @@ -73,7 +73,7 @@ def gaussian_blur_image_tensor( output = _FT.gaussian_blur(image, kernel_size, sigma) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output -- GitLab From 149edda463b54b3eabe989e260a839727c89d099 Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 17 Oct 2022 09:59:33 +0200 Subject: [PATCH 052/624] [proto] Reduce number of calls of __torch_function__ (#6681) * [proto] Reduce number of calls of __torch_function__ * Use DisableTorchFunction and super * Use self._tensor * Fixes mypy and color space handling * revert Image.new_like * WIP * Perf opt with ref to tensor and properties * Removed requires_grad property * Use _tensor ref * Revert "Use _tensor ref" This reverts commit 38f8e21242830fed46ddf31287edb67c1abd124a. 
* Update torchvision/prototype/features/_feature.py Co-authored-by: Philip Meier Co-authored-by: Philip Meier --- torchvision/prototype/features/_feature.py | 23 ++++++++++++++++++++++ torchvision/prototype/features/_video.py | 6 +++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index a56441f29..1cc2d8d4b 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -6,6 +6,7 @@ from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Type import PIL.Image import torch from torch._C import DisableTorchFunction +from torch.types import _device, _dtype, _size from torchvision.transforms import InterpolationMode @@ -128,6 +129,28 @@ class _Feature(torch.Tensor): _Feature.__F = functional return _Feature.__F + # Add properties for common attributes like shape, dtype, device, ndim etc + # this way we return the result without passing into __torch_function__ + @property + def shape(self) -> _size: # type: ignore[override] + with DisableTorchFunction(): + return super().shape + + @property + def ndim(self) -> int: # type: ignore[override] + with DisableTorchFunction(): + return super().ndim + + @property + def device(self, *args: Any, **kwargs: Any) -> _device: # type: ignore[override] + with DisableTorchFunction(): + return super().device + + @property + def dtype(self) -> _dtype: # type: ignore[override] + with DisableTorchFunction(): + return super().dtype + def horizontal_flip(self) -> _Feature: return self diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index 9dfff7f96..26f97549a 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -15,9 +15,9 @@ class Video(_Feature): @classmethod def _wrap(cls, tensor: torch.Tensor, *, color_space: ColorSpace) -> Video: - image = tensor.as_subclass(cls) - image.color_space = color_space - return image + video = tensor.as_subclass(cls) + video.color_space = color_space + return video def __new__( cls, -- GitLab From decb191962250e5969686f0fc07c8ee5f45b181b Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 17 Oct 2022 16:45:28 +0200 Subject: [PATCH 053/624] [proto] Small optimization for gaussian_blur functional op (#6762) * Use softmax in _get_gaussian_kernel1d * Revert "Use softmax in _get_gaussian_kernel1d" This reverts commit eb8fba36302d2da9e06e6f40afaaf901b276a771. 
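
Despite the add/revert back and forth in the bullets above, the version of _get_gaussian_kernel1d that this patch lands (in the _misc.py hunk further down) does build the 1-D kernel as softmax(-x**2). That is algebraically the same kernel as the classic exp-then-normalize formulation, since softmax divides exp(-x**2) by its sum; the max subtraction inside softmax is what adds numerical stability, and the relaxed tolerances in the consistency tests come from float32 accumulation differences in the new conv2d path rather than from the kernel values themselves. A small numeric check, assuming the classic formulation as the baseline:

    import math
    import torch

    kernel_size, sigma = 5, 1.5

    # formulation used in the patch
    lim = (kernel_size - 1) / (2 * math.sqrt(2) * sigma)
    x = torch.linspace(-lim, lim, steps=kernel_size)
    new = torch.softmax(-x.pow(2), dim=0)

    # classic formulation: normalized exp(-0.5 * (x / sigma) ** 2)
    x_ref = torch.linspace(-(kernel_size - 1) * 0.5, (kernel_size - 1) * 0.5, steps=kernel_size)
    pdf = torch.exp(-0.5 * (x_ref / sigma).pow(2))
    ref = pdf / pdf.sum()

    print(torch.allclose(new, ref))   # True
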
* Code update * Relaxed tolerance in consistency tests for GaussianBlur and ElasticTransform * Code review updates * Update test_prototype_transforms_consistency.py --- test/test_prototype_transforms_consistency.py | 40 +++++++++++-------- .../prototype/transforms/functional/_misc.py | 31 +++++++++++++- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 7d2f1d735..212755068 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -308,22 +308,28 @@ CONSISTENCY_CONFIGS = [ ArgsKwargs(brightness=0.1, contrast=0.4, saturation=0.7, hue=0.3), ], ), - ConsistencyConfig( - prototype_transforms.ElasticTransform, - legacy_transforms.ElasticTransform, - [ - ArgsKwargs(), - ArgsKwargs(alpha=20.0), - ArgsKwargs(alpha=(15.3, 27.2)), - ArgsKwargs(sigma=3.0), - ArgsKwargs(sigma=(2.5, 3.9)), - ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.BICUBIC), - ArgsKwargs(fill=1), - ], - # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(163, 163), (72, 333), (313, 95)]), - ), + *[ + ConsistencyConfig( + prototype_transforms.ElasticTransform, + legacy_transforms.ElasticTransform, + [ + ArgsKwargs(), + ArgsKwargs(alpha=20.0), + ArgsKwargs(alpha=(15.3, 27.2)), + ArgsKwargs(sigma=3.0), + ArgsKwargs(sigma=(2.5, 3.9)), + ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.NEAREST), + ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.BICUBIC), + ArgsKwargs(fill=1), + ], + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(163, 163), (72, 333), (313, 95)], dtypes=[dt]), + # We updated gaussian blur kernel generation with a faster and numerically more stable version + # This brings float32 accumulation visible in elastic transform -> we need to relax consistency tolerance + closeness_kwargs=ckw, + ) + for dt, ckw in [(torch.uint8, {"rtol": 1e-1, "atol": 1}), (torch.float32, {"rtol": 1e-2, "atol": 1e-3})] + ], ConsistencyConfig( prototype_transforms.GaussianBlur, legacy_transforms.GaussianBlur, @@ -333,6 +339,7 @@ CONSISTENCY_CONFIGS = [ ArgsKwargs(kernel_size=3, sigma=0.7), ArgsKwargs(kernel_size=5, sigma=(0.3, 1.4)), ], + closeness_kwargs={"rtol": 1e-5, "atol": 1e-5}, ), ConsistencyConfig( prototype_transforms.RandomAffine, @@ -506,7 +513,6 @@ def check_call_consistency( image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" image_tensor = torch.Tensor(image) - try: torch.manual_seed(0) output_legacy_tensor = legacy_transform(image_tensor) diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 5b2dd135a..fa4a6e9be 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -1,7 +1,9 @@ +import math from typing import List, Optional, Union import PIL.Image import torch +from torch.nn.functional import conv2d, pad as torch_pad from torchvision.prototype import features from torchvision.transforms import functional_tensor as _FT from torchvision.transforms.functional import pil_to_tensor, to_pil_image @@ -32,6 +34,22 @@ def normalize( return 
normalize_image_tensor(inpt, mean=mean, std=std, inplace=inplace) +def _get_gaussian_kernel1d(kernel_size: int, sigma: float) -> torch.Tensor: + lim = (kernel_size - 1) / (2 * math.sqrt(2) * sigma) + x = torch.linspace(-lim, lim, steps=kernel_size) + kernel1d = torch.softmax(-x.pow_(2), dim=0) + return kernel1d + + +def _get_gaussian_kernel2d( + kernel_size: List[int], sigma: List[float], dtype: torch.dtype, device: torch.device +) -> torch.Tensor: + kernel1d_x = _get_gaussian_kernel1d(kernel_size[0], sigma[0]).to(device, dtype=dtype) + kernel1d_y = _get_gaussian_kernel1d(kernel_size[1], sigma[1]).to(device, dtype=dtype) + kernel2d = kernel1d_y.unsqueeze(-1) * kernel1d_x + return kernel2d + + def gaussian_blur_image_tensor( image: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> torch.Tensor: @@ -70,7 +88,18 @@ def gaussian_blur_image_tensor( else: needs_unsquash = False - output = _FT.gaussian_blur(image, kernel_size, sigma) + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 + kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype, device=image.device) + kernel = kernel.expand(image.shape[-3], 1, kernel.shape[0], kernel.shape[1]) + + image, need_cast, need_squeeze, out_dtype = _FT._cast_squeeze_in(image, [kernel.dtype]) + + # padding = (left, right, top, bottom) + padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2] + output = torch_pad(image, padding, mode="reflect") + output = conv2d(output, kernel, groups=output.shape[-3]) + + output = _FT._cast_squeeze_out(output, need_cast, need_squeeze, out_dtype) if needs_unsquash: output = output.reshape(shape) -- GitLab From 0610b13ac4af3717f538454a9c6b1f441cb386f3 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Mon, 17 Oct 2022 14:01:21 -0400 Subject: [PATCH 054/624] [Nova] Add GHA Linux CPU Unittests for Torchvision (#6759) * [Nova][WIP] Add Linux CPU Unittests for Torchvision * use conda-builder image since conda installation is needed * install torch dep with conda instead * use circleCI command to run tests * larger instance to avoid OOM issues * proper syntax for self-hosted runners * 4xlarge instance * 8xlarge * 12xlarge * use setup-miniconda job * add back PATH change to help setup py detect conda * run conda shell script * install other deps up front * git config and undo path change * revert to local conda install * conda-builder image * support for whole python version matrix * clean up the conda env once we are done with the job --- .github/workflows/test-linux-cpu.yml | 69 ++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 .github/workflows/test-linux-cpu.yml diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml new file mode 100644 index 000000000..81ace6351 --- /dev/null +++ b/.github/workflows/test-linux-cpu.yml @@ -0,0 +1,69 @@ +name: Unit-tests on Linux CPU + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +env: + CHANNEL: "nightly" + +jobs: + tests: + name: "Unit-tests on Linux CPU" + runs-on: [self-hosted, linux.12xlarge] + container: + image: pytorch/conda-builder:cpu + strategy: + matrix: + py_vers: ["3.7", "3.8", "3.9", "3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Set Release CHANNEL (for release) + if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} + run: | + echo 
"CHANNEL=test" >> "$GITHUB_ENV" + - name: Setup Conda + shell: bash -l {0} + env: + ENV_NAME: conda-env-${{ github.run_id }} + PY_VERS: ${{ matrix.py_vers }} + run: | + git config --global --add safe.directory /__w/vision/vision + . ~/miniconda3/etc/profile.d/conda.sh + conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy + echo "CONDA_RUN=conda run -p ${ENV_NAME}" >> "$GITHUB_ENV" + - name: Install TorchVision + shell: bash -l {0} + env: + VERSION: cpu + CUDATOOLKIT: cpuonly + run: | + # Needed for JPEG library detection as setup.py detects conda presence + # by running `shutil.which('conda')` + export PATH=~/miniconda3/bin:$PATH + set -ex + ${CONDA_RUN} conda install \ + --yes \ + -c "pytorch-${CHANNEL}" \ + -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ + "${CUDATOOLKIT}" + ${CONDA_RUN} python3 setup.py develop + ${CONDA_RUN} python3 -m pip install pytest pytest-mock av + - name: Run tests + shell: bash -l {0} + env: + ENV_NAME: conda-env-${{ github.run_id }} + PY_VERS: ${{ matrix.py_vers }} + run: | + . ~/miniconda3/etc/profile.d/conda.sh + set -ex + ${CONDA_RUN} python3 -m torch.utils.collect_env + ${CONDA_RUN} python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + conda env remove -p ${ENV_NAME} -- GitLab From e23542da0dd85193bb831d6f62b2101a9651cad0 Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Tue, 18 Oct 2022 15:38:36 +0100 Subject: [PATCH 055/624] Add raft_stereo weights (#6786) * Add raft_stereo weights * Update the metrics layout --- .../models/depth/stereo/raft_stereo.py | 94 ++++++++++++++++++- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/models/depth/stereo/raft_stereo.py b/torchvision/prototype/models/depth/stereo/raft_stereo.py index 541a11f04..4b6f5a0bd 100644 --- a/torchvision/prototype/models/depth/stereo/raft_stereo.py +++ b/torchvision/prototype/models/depth/stereo/raft_stereo.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Callable, List, Optional, Tuple import torch @@ -5,11 +6,12 @@ import torch.nn as nn import torch.nn.functional as F import torchvision.models.optical_flow.raft as raft from torch import Tensor -from torchvision.models._api import register_model, WeightsEnum +from torchvision.models._api import register_model, Weights, WeightsEnum from torchvision.models._utils import handle_legacy_interface from torchvision.models.optical_flow._utils import grid_sample, make_coords_grid, upsample_flow from torchvision.models.optical_flow.raft import FlowHead, MotionEncoder, ResidualBlock from torchvision.ops import Conv2dNormActivation +from torchvision.prototype.transforms._presets import StereoMatching from torchvision.utils import _log_api_usage_once @@ -624,11 +626,97 @@ def _raft_stereo( class Raft_Stereo_Realtime_Weights(WeightsEnum): - pass + SCENEFLOW_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + url="https://download.pytorch.org/models/raft_stereo_realtime-cf345ccb.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 8077152, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + "Kitty2015": { + "3px": 0.9409, + } + }, + }, + ) + + DEFAULT = SCENEFLOW_V1 class Raft_Stereo_Base_Weights(WeightsEnum): - pass + SCENEFLOW_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + 
url="https://download.pytorch.org/models/raft_stereo_base_sceneflow-eff3f2e6.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 11116176, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + # Using standard metrics for each datasets + "Kitty2015": { + # Ratio of pixels with difference less than 3px from ground truth + "3px": 0.9426, + }, + # For middlebury, ratio of pixels with difference less than 2px from ground truth + # on full, half, and quarter image resolution + "Middlebury2014-val-full": { + "2px": 0.8167, + }, + "Middlebury2014-val-half": { + "2px": 0.8741, + }, + "Middlebury2014-val-quarter": { + "2px": 0.9064, + }, + "ETH3D-val": { + # Ratio of pixels with difference less than 1px from ground truth + "1px": 0.9672, + }, + }, + }, + ) + + MIDDLEBURY_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + url="https://download.pytorch.org/models/raft_stereo_base_middlebury-afa9d252.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 11116176, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + "Middlebury-test": { + "mae": 1.27, + "1px": 0.9063, + "2px": 0.9526, + "5px": 0.9725, + } + }, + }, + ) + + ETH3D_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + url="https://download.pytorch.org/models/raft_stereo_base_eth3d-d4830f22.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 11116176, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + "ETH3D-test": { + "mae": 0.18, + "1px": 0.9756, + "2px": 0.9956, + } + }, + }, + ) + + DEFAULT = MIDDLEBURY_V1 @register_model() -- GitLab From f8b5a7af8be9ef0e27187b0595d8a9304fa9ba52 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 18 Oct 2022 16:45:23 +0200 Subject: [PATCH 056/624] don't fail linux CPU tests fast (#6788) --- .github/workflows/test-linux-cpu.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 81ace6351..1e127c6ac 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -21,6 +21,7 @@ jobs: strategy: matrix: py_vers: ["3.7", "3.8", "3.9", "3.10"] + fail-fast: false steps: - name: Checkout repository -- GitLab From 32757a260dfedebf71eb470bd0a072ed20beddc3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 18 Oct 2022 16:52:09 +0200 Subject: [PATCH 057/624] fix warnings in prototype transforms test suite (#6785) * fix, ignore, or assert warnings for consistency tests * fix, ignore, or assert warnings for kernel infos * fix to_image_tensor for numpy inputs * make image from numpy contiguous * fix test --- test/prototype_transforms_kernel_infos.py | 15 +++++++++++---- test/test_prototype_transforms_consistency.py | 13 ++++++++----- test/test_prototype_transforms_functional.py | 9 +-------- .../transforms/functional/_type_conversion.py | 2 +- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c455caa6b..f8b237f2e 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1,6 +1,7 @@ import functools 
import itertools import math +import re import numpy as np import pytest @@ -172,6 +173,12 @@ KERNEL_INFOS.extend( KernelInfo( F.horizontal_flip_bounding_box, sample_inputs_fn=sample_inputs_horizontal_flip_bounding_box, + test_marks=[ + TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.filterwarnings(f"ignore:{re.escape('operator() profile_node %72')}:UserWarning"), + ) + ], ), KernelInfo( F.horizontal_flip_mask, @@ -443,10 +450,10 @@ def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, transformed_points = np.matmul(points, affine_matrix.T) out_bbox = torch.tensor( [ - np.min(transformed_points[:, 0]), - np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), + np.min(transformed_points[:, 0]).item(), + np.min(transformed_points[:, 1]).item(), + np.max(transformed_points[:, 0]).item(), + np.max(transformed_points[:, 1]).item(), ], dtype=bbox.dtype, ) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 212755068..362a7a1c0 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -1,6 +1,7 @@ import enum import inspect import random +import re from collections import defaultdict from importlib.machinery import SourceFileLoader from pathlib import Path @@ -598,6 +599,7 @@ def check_call_consistency( for idx, args_kwargs in enumerate(config.args_kwargs) ], ) +@pytest.mark.filterwarnings("ignore") def test_call_consistency(config, args_kwargs): args, kwargs = args_kwargs @@ -671,21 +673,21 @@ class TestContainerTransforms: check_call_consistency(prototype_transform, legacy_transform) # We can't test other values for `p` since the random parameter generation is different - @pytest.mark.parametrize("p", [(0, 1), (1, 0)]) - def test_random_choice(self, p): + @pytest.mark.parametrize("probabilities", [(0, 1), (1, 0)]) + def test_random_choice(self, probabilities): prototype_transform = prototype_transforms.RandomChoice( [ prototype_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - p=p, + probabilities=probabilities, ) legacy_transform = legacy_transforms.RandomChoice( [ legacy_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - p=p, + p=probabilities, ) check_call_consistency(prototype_transform, legacy_transform) @@ -702,7 +704,8 @@ class TestToTensorTransforms: assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) def test_to_tensor(self): - prototype_transform = prototype_transforms.ToTensor() + with pytest.warns(UserWarning, match=re.escape("The transform `ToTensor()` is deprecated")): + prototype_transform = prototype_transforms.ToTensor() legacy_transform = legacy_transforms.ToTensor() for image in make_images(extra_dims=[()]): diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 34291611d..bafe1f134 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1012,17 +1012,10 @@ def test_normalize_output_type(): def test_to_image_tensor(inpt): output = F.to_image_tensor(inpt) assert isinstance(output, torch.Tensor) + assert output.shape == (3, 32, 32) assert np.asarray(inpt).sum() == output.sum().item() - if isinstance(inpt, PIL.Image.Image): - # we can't check this option - # as PIL -> numpy is always copying - return - - inpt[0, 0, 0] = 11 - assert output[0, 0, 0] == 11 - @pytest.mark.parametrize( "inpt", diff --git 
a/torchvision/prototype/transforms/functional/_type_conversion.py b/torchvision/prototype/transforms/functional/_type_conversion.py index b171716ae..5fe990eb7 100644 --- a/torchvision/prototype/transforms/functional/_type_conversion.py +++ b/torchvision/prototype/transforms/functional/_type_conversion.py @@ -27,7 +27,7 @@ def decode_video_with_av(encoded_video: torch.Tensor) -> Tuple[torch.Tensor, tor @torch.jit.unused def to_image_tensor(image: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> features.Image: if isinstance(image, np.ndarray): - output = torch.from_numpy(image) + output = torch.from_numpy(image).permute((2, 0, 1)).contiguous() elif isinstance(image, PIL.Image.Image): output = pil_to_tensor(image) else: # isinstance(inpt, torch.Tensor): -- GitLab From 78fdaf3a757e6eaacc458883bcf7464b1711ce7a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 19 Oct 2022 11:21:36 +0200 Subject: [PATCH 058/624] pin pyav to <10 (#6789) * pin pyav to <10 * pin av in GHA workflows as well * also pin in M1 workflow --- .circleci/unittest/linux/scripts/environment.yml | 2 +- .circleci/unittest/windows/scripts/environment.yml | 2 +- .github/workflows/test-linux-cpu.yml | 2 +- .github/workflows/test-m1.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml index 77ee99295..fae96c5f9 100644 --- a/.circleci/unittest/linux/scripts/environment.yml +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -13,4 +13,4 @@ dependencies: - pip: - future - scipy - - av + - av < 10 diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml index 0e07ae80d..d229aafb4 100644 --- a/.circleci/unittest/windows/scripts/environment.yml +++ b/.circleci/unittest/windows/scripts/environment.yml @@ -14,6 +14,6 @@ dependencies: - pip: - future - scipy - - av != 9.1.1 + - av !=9.1.1, <10 - dataclasses - h5py diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 1e127c6ac..b6891bbfb 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -56,7 +56,7 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" ${CONDA_RUN} python3 setup.py develop - ${CONDA_RUN} python3 -m pip install pytest pytest-mock av + ${CONDA_RUN} python3 -m pip install pytest pytest-mock 'av<10' - name: Run tests shell: bash -l {0} env: diff --git a/.github/workflows/test-m1.yml b/.github/workflows/test-m1.yml index 1e5f79f82..c03fa9f76 100644 --- a/.github/workflows/test-m1.yml +++ b/.github/workflows/test-m1.yml @@ -37,7 +37,7 @@ jobs: conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy conda run -p ${ENV_NAME} python3 -mpip install --pre torch --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} conda run -p ${ENV_NAME} python3 setup.py develop - conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock av + conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock 'av<10' - name: Run tests shell: arch -arch arm64 bash {0} env: -- GitLab From 7a62a545ce76f43ccc5cfe0009131f7db14ae7b5 Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Wed, 19 Oct 2022 19:11:41 +0100 Subject: [PATCH 059/624] Some fixes for crestereo (#6791) --- .../prototype/models/depth/stereo/crestereo.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/models/depth/stereo/crestereo.py 
b/torchvision/prototype/models/depth/stereo/crestereo.py index 496438522..29c0be936 100644 --- a/torchvision/prototype/models/depth/stereo/crestereo.py +++ b/torchvision/prototype/models/depth/stereo/crestereo.py @@ -763,7 +763,7 @@ class CREStereo(nn.Module): return "1d" if iteration % 2 == 0 else "2d" def forward( - self, left_image: Tensor, right_image: Tensor, flow_init: Optional[Tensor], num_iters: int = 10 + self, left_image: Tensor, right_image: Tensor, flow_init: Optional[Tensor] = None, num_iters: int = 10 ) -> List[Tensor]: features = torch.cat([left_image, right_image], dim=0) features = self.feature_encoder(features) @@ -781,10 +781,10 @@ class CREStereo(nn.Module): ctx_pyramid = self.downsampling_pyramid(ctx) # we store in reversed order because we process the pyramid from top to bottom - l_pyramid: Dict[str, Tensor] = {res: l_pyramid[idx] for idx, res in enumerate(self.resolutions)} - r_pyramid: Dict[str, Tensor] = {res: r_pyramid[idx] for idx, res in enumerate(self.resolutions)} - net_pyramid: Dict[str, Tensor] = {res: net_pyramid[idx] for idx, res in enumerate(self.resolutions)} - ctx_pyramid: Dict[str, Tensor] = {res: ctx_pyramid[idx] for idx, res in enumerate(self.resolutions)} + l_pyramid = {res: l_pyramid[idx] for idx, res in enumerate(self.resolutions)} + r_pyramid = {res: r_pyramid[idx] for idx, res in enumerate(self.resolutions)} + net_pyramid = {res: net_pyramid[idx] for idx, res in enumerate(self.resolutions)} + ctx_pyramid = {res: ctx_pyramid[idx] for idx, res in enumerate(self.resolutions)} # offsets for sampling pixel candidates in the correlation ops offsets: Dict[str, Tensor] = {} @@ -1425,6 +1425,9 @@ def crestereo_base(*, weights: Optional[CREStereo_Base_Weights] = None, progress .. autoclass:: torchvision.prototype.models.depth.stereo.CREStereo_Base_Weights :members: """ + + weights = CREStereo_Base_Weights.verify(weights) + return _crestereo( weights=weights, progress=progress, -- GitLab From 211563fba461062268db7cdf5a83203ed9e83e6f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 20 Oct 2022 14:14:37 +0200 Subject: [PATCH 060/624] improve perf on convert_image_dtype and add tests (#6795) * improve perf on convert_image_dtype and add tests * add reference tests * use bitshifts for int to int * revert bitshifts for int to int upscale * fix warning ignore --- test/prototype_transforms_kernel_infos.py | 118 ++++++++++++++++++ test/test_prototype_transforms_functional.py | 46 +++++-- .../transforms/functional/_type_conversion.py | 77 +++++++++++- 3 files changed, 226 insertions(+), 15 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index f8b237f2e..133508f5f 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1,3 +1,4 @@ +import decimal import functools import itertools import math @@ -21,6 +22,7 @@ from prototype_common_utils import ( mark_framework_limitation, TestMark, ) +from torch.utils._pytree import tree_map from torchvision.prototype import features from torchvision.transforms.functional_tensor import _max_value as get_max_value @@ -1947,3 +1949,119 @@ KERNEL_INFOS.extend( ), ] ) + + +def sample_inputs_convert_image_dtype(): + for input_dtype, output_dtype in itertools.product( + [torch.uint8, torch.int64, torch.float32, torch.float64], repeat=2 + ): + if input_dtype.is_floating_point and output_dtype == torch.int64: + # conversion cannot be performed safely + continue + + for image_loader in make_image_loaders( + 
sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[input_dtype] + ): + yield ArgsKwargs(image_loader, dtype=output_dtype) + + yield ArgsKwargs(make_image_loader(color_space=features.ColorSpace.RGB), dtype=torch.uint8) + + +def reference_convert_image_dtype(image, dtype=torch.float): + input_dtype = image.dtype + output_dtype = dtype + + if output_dtype == input_dtype: + return image + + def fn(value): + if input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return int(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return value // factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype) + + +def reference_inputs_convert_image_dtype(): + for input_dtype, output_dtype in itertools.product( + [ + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.float16, + torch.float32, + torch.float64, + torch.bfloat16, + ], + repeat=2, + ): + if (input_dtype == torch.float32 and output_dtype in {torch.int32, torch.int64}) or ( + input_dtype == torch.float64 and output_dtype == torch.int64 + ): + continue + + if input_dtype.is_floating_point: + data = [0.0, 0.5, 1.0] + else: + max_value = torch.iinfo(input_dtype).max + data = [0, max_value // 2, max_value] + image = torch.tensor(data, dtype=input_dtype) + + yield ArgsKwargs(image, dtype=output_dtype) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.convert_image_dtype, + sample_inputs_fn=sample_inputs_convert_image_dtype, + reference_fn=reference_convert_image_dtype, + reference_inputs_fn=reference_inputs_convert_image_dtype, + test_marks=[ + TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.filterwarnings(f"ignore:{re.escape('operator() profile_node %41')}:UserWarning"), + ), + TestMark( + ("TestKernels", "test_dtype_and_device_consistency"), + pytest.mark.skip(reason="`convert_dtype_*` kernels convert the dtype by design"), + condition=lambda args_kwargs: args_kwargs.args[0].dtype + != args_kwargs.kwargs.get("dtype", torch.float32), + ), + TestMark( + ("TestKernels", "test_against_reference"), + pytest.mark.xfail(reason="Conversion overflows"), + condition=lambda args_kwargs: ( + args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16} + and not args_kwargs.kwargs["dtype"].is_floating_point + ) + or ( + args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16} + and args_kwargs.kwargs["dtype"] == torch.int64 + ) + or ( + args_kwargs.args[0].dtype in {torch.int32, torch.int64} + and args_kwargs.kwargs["dtype"] == torch.float16 + ), + ), + ], + ), + ] +) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index bafe1f134..3423006e2 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -26,6 +26,20 @@ def script(fn): raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error +def make_info_args_kwargs_params(info, *, args_kwargs_fn, test_id=None): + args_kwargs = list(args_kwargs_fn(info)) + idx_field_len = len(str(len(args_kwargs))) + return [ + pytest.param( + info, + args_kwargs_, + 
marks=info.get_marks(test_id, args_kwargs_) if test_id else [], + id=f"{info.id}-{idx:0{idx_field_len}}", + ) + for idx, args_kwargs_ in enumerate(args_kwargs) + ] + + def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None): if condition is None: @@ -49,18 +63,7 @@ def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=No if not condition(info): continue - args_kwargs = list(args_kwargs_fn(info)) - idx_field_len = len(str(len(args_kwargs))) - - for idx, args_kwargs_ in enumerate(args_kwargs): - argvalues.append( - pytest.param( - info, - args_kwargs_, - marks=info.get_marks(test_id, args_kwargs_), - id=f"{info.id}-{idx:0{idx_field_len}}", - ) - ) + argvalues.extend(make_info_args_kwargs_params(info, args_kwargs_fn=args_kwargs_fn, test_id=test_id)) return pytest.mark.parametrize(argnames, argvalues)(test_fn) @@ -232,7 +235,6 @@ class TestDispatchers: [ F.clamp_bounding_box, F.convert_color_space, - F.convert_image_dtype, F.get_dimensions, F.get_image_num_channels, F.get_image_size, @@ -312,6 +314,24 @@ def test_alias(alias, target): assert alias is target +@pytest.mark.parametrize( + ("info", "args_kwargs"), + make_info_args_kwargs_params( + next(info for info in KERNEL_INFOS if info.kernel is F.convert_image_dtype), + args_kwargs_fn=lambda info: info.sample_inputs_fn(), + ), +) +@pytest.mark.parametrize("device", cpu_and_gpu()) +def test_dtype_and_device_convert_image_dtype(info, args_kwargs, device): + (input, *other_args), kwargs = args_kwargs.load(device) + dtype = other_args[0] if other_args else kwargs.get("dtype", torch.float32) + + output = info.kernel(input, dtype) + + assert output.dtype == dtype + assert output.device == input.device + + # TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in # `prototype_transforms_kernel_infos.py` diff --git a/torchvision/prototype/transforms/functional/_type_conversion.py b/torchvision/prototype/transforms/functional/_type_conversion.py index 5fe990eb7..a57fbc655 100644 --- a/torchvision/prototype/transforms/functional/_type_conversion.py +++ b/torchvision/prototype/transforms/functional/_type_conversion.py @@ -7,7 +7,7 @@ import torch from torchvision.io.video import read_video from torchvision.prototype import features from torchvision.prototype.utils._internal import ReadOnlyTensorBuffer -from torchvision.transforms import functional as _F +from torchvision.transforms import functional as _F, functional_tensor as _FT @torch.jit.unused @@ -42,4 +42,77 @@ pil_to_tensor = _F.pil_to_tensor # prevalent and well understood. Thus, we just alias it without deprecating the old name. 
to_pil_image = to_image_pil -convert_image_dtype = _F.convert_image_dtype + +def _num_value_bits(dtype: torch.dtype) -> int: + if dtype == torch.uint8: + return 8 + elif dtype == torch.int8: + return 7 + elif dtype == torch.int16: + return 15 + elif dtype == torch.int32: + return 31 + elif dtype == torch.int64: + return 63 + else: + raise TypeError(f"Number of value bits is only defined for integer dtypes, but got {dtype}.") + + +def convert_image_dtype(image: torch.Tensor, dtype: torch.dtype = torch.float) -> torch.Tensor: + if not isinstance(image, torch.Tensor): + raise TypeError("Input img should be Tensor Image") + + if image.dtype == dtype: + return image + + float_input = image.is_floating_point() + if torch.jit.is_scripting(): + # TODO: remove this branch as soon as `dtype.is_floating_point` is supported by JIT + float_output = torch.tensor(0, dtype=dtype).is_floating_point() + else: + float_output = dtype.is_floating_point + + if float_input: + # float to float + if float_output: + return image.to(dtype) + + # float to int + if (image.dtype == torch.float32 and dtype in (torch.int32, torch.int64)) or ( + image.dtype == torch.float64 and dtype == torch.int64 + ): + raise RuntimeError(f"The conversion from {image.dtype} to {dtype} cannot be performed safely.") + + # For data in the range `[0.0, 1.0]`, just multiplying by the maximum value of the integer range and converting + # to the integer dtype is not sufficient. For example, `torch.rand(...).mul(255).to(torch.uint8)` will only + # be `255` if the input is exactly `1.0`. See https://github.com/pytorch/vision/pull/2078#issuecomment-612045321 + # for a detailed analysis. + # To mitigate this, we could round before we convert to the integer dtype, but this is an extra operation. + # Instead, we can also multiply by the maximum value plus something close to `1`. See + # https://github.com/pytorch/vision/pull/2078#issuecomment-613524965 for details. + eps = 1e-3 + max_value = float(_FT._max_value(dtype)) + # We need to scale first since the conversion would otherwise turn the input range `[0.0, 1.0]` into the + # discrete set `{0, 1}`. + return image.mul(max_value + 1.0 - eps).to(dtype) + else: + # int to float + if float_output: + return image.to(dtype).div_(_FT._max_value(image.dtype)) + + # int to int + num_value_bits_input = _num_value_bits(image.dtype) + num_value_bits_output = _num_value_bits(dtype) + + if num_value_bits_input > num_value_bits_output: + return image.bitwise_right_shift(num_value_bits_input - num_value_bits_output).to(dtype) + else: + # The bitshift kernel is not vectorized + # https://github.com/pytorch/pytorch/blob/703c19008df4700b6a522b0ae5c4b6d5ffc0906f/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L315-L322 + # This results in the multiplication actually being faster. 
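
One readability note on this branch: max_value_input is read from the target dtype and max_value_output from the image's own dtype, so the names are swapped relative to what they hold, although the resulting factor is correct. A quick sanity check of the integer-to-integer arithmetic, for a uint8 -> int16 -> uint8 round trip (the values are illustrative only):

    import torch

    u8 = torch.tensor([0, 128, 255], dtype=torch.uint8)

    # upscale uint8 -> int16: factor = (32767 + 1) // (255 + 1) = 128
    factor = (torch.iinfo(torch.int16).max + 1) // (torch.iinfo(torch.uint8).max + 1)
    up = u8.to(torch.int16) * factor
    print(up)        # [0, 16384, 32640]

    # downscale int16 -> uint8: shift right by 15 - 8 = 7 value bits
    down = up.bitwise_right_shift(7).to(torch.uint8)
    print(down)      # [0, 128, 255]
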
+ # TODO: If the bitshift kernel is optimized in core, replace the computation below with + # `image.to(dtype).bitwise_left_shift_(num_value_bits_output - num_value_bits_input)` + max_value_input = float(_FT._max_value(dtype)) + max_value_output = float(_FT._max_value(image.dtype)) + factor = int((max_value_input + 1) // (max_value_output + 1)) + return image.to(dtype).mul_(factor) -- GitLab From 246de0772c80eae435c0a562268d8d21ed7a27a2 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Thu, 20 Oct 2022 14:01:32 -0400 Subject: [PATCH 061/624] [Nova] Migrate Linux CPU job to Generic Job (#6797) * [Nova] Migrate Linux CPU job to Generic Job * branch ref for composite action job * move checkout step to separate job * added runs-on * nit fixes * no need to run conda sheel script thing * Channel is set inside the script * add remaining env vars * nit env var fix * cleanup * simplify unneeded jobs * name of the conda env should be the path * remove main ref to use PR --- .github/workflows/test-linux-cpu.yml | 85 ++++++++++++---------------- 1 file changed, 36 insertions(+), 49 deletions(-) diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index b6891bbfb..f78dd323d 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -14,57 +14,44 @@ env: jobs: tests: - name: "Unit-tests on Linux CPU" - runs-on: [self-hosted, linux.12xlarge] - container: - image: pytorch/conda-builder:cpu strategy: matrix: py_vers: ["3.7", "3.8", "3.9", "3.10"] fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.12xlarge + repository: pytorch/vision + script: | + # Mark Build Directory Safe + git config --global --add safe.directory /__w/vision/vision - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Setup Conda - shell: bash -l {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - git config --global --add safe.directory /__w/vision/vision - . ~/miniconda3/etc/profile.d/conda.sh - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy - echo "CONDA_RUN=conda run -p ${ENV_NAME}" >> "$GITHUB_ENV" - - name: Install TorchVision - shell: bash -l {0} - env: - VERSION: cpu - CUDATOOLKIT: cpuonly - run: | - # Needed for JPEG library detection as setup.py detects conda presence - # by running `shutil.which('conda')` - export PATH=~/miniconda3/bin:$PATH - set -ex - ${CONDA_RUN} conda install \ - --yes \ - -c "pytorch-${CHANNEL}" \ - -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ - "${CUDATOOLKIT}" - ${CONDA_RUN} python3 setup.py develop - ${CONDA_RUN} python3 -m pip install pytest pytest-mock 'av<10' - - name: Run tests - shell: bash -l {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . 
~/miniconda3/etc/profile.d/conda.sh - set -ex - ${CONDA_RUN} python3 -m torch.utils.collect_env - ${CONDA_RUN} python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 - conda env remove -p ${ENV_NAME} + # Set up Environment Variables + export PYTHON_VERSION="${{ matrix.py_vers }}" + export VERSION="cpu" + export CUDATOOLKIT="cpuonly" + + # Set CHANNEL + if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + export CHANNEL=test + else + export CHANNEL=nightly + fi + + # Create Conda Env + conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy + conda activate /work/ci_env + + # Install PyTorch, Torchvision, and testing libraries + set -ex + conda install \ + --yes \ + -c "pytorch-${CHANNEL}" \ + -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ + "${CUDATOOLKIT}" + python3 setup.py develop + python3 -m pip install pytest pytest-mock 'av<10' + + # Run Tests + python3 -m torch.utils.collect_env + python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 -- GitLab From 06ad05fa60f8af0ba36b726c8e0233040811a588 Mon Sep 17 00:00:00 2001 From: Joao Gomes Date: Fri, 21 Oct 2022 09:58:13 +0100 Subject: [PATCH 062/624] Read video from memory newapi (#6771) * add tensor as optional param * add init from memory * fix bug * fix bug * first working version * apply formatting and add tests * simplify tests * fix tests * fix wrong variable name * add path as optional parameter * add src as optional * address pr comments * Fix warning messages * address pr comments * make tests stricter * Revert "make tests stricter" This reverts commit 6c92e94e8372f381c9496c9f885c2c71b6a4356b. --- test/test_videoapi.py | 41 +++++++++++++++++++++ torchvision/csrc/io/decoder/defs.h | 2 +- torchvision/csrc/io/video/video.cpp | 57 +++++++++++++++++++++++------ torchvision/csrc/io/video/video.h | 20 +++++++++- torchvision/io/video_reader.py | 47 +++++++++++++++++++++--- 5 files changed, 148 insertions(+), 19 deletions(-) diff --git a/test/test_videoapi.py b/test/test_videoapi.py index 895b9b835..4688e5a64 100644 --- a/test/test_videoapi.py +++ b/test/test_videoapi.py @@ -77,6 +77,7 @@ class TestVideoApi: # compare the frames and ptss for i in range(len(vr_frames)): assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) + mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) # on average the difference is very small and caused # by decoding (around 1%) @@ -114,6 +115,46 @@ class TestVideoApi: # we assure that there is never more than 1% difference in signal assert max_delta.item() < 0.001 + @pytest.mark.parametrize("stream", ["video", "audio"]) + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_frame_reading_mem_vs_file(self, test_video, stream): + full_path = os.path.join(VIDEO_DIR, test_video) + + # Test video reading from file vs from memory + vr_frames, vr_frames_mem = [], [] + vr_pts, vr_pts_mem = [], [] + # get vr frames + video_reader = VideoReader(full_path, stream) + for vr_frame in video_reader: + vr_frames.append(vr_frame["data"]) + vr_pts.append(vr_frame["pts"]) + + # get vr frames = read from memory + f = open(full_path, "rb") + fbytes = f.read() + f.close() + video_reader_from_mem = VideoReader(fbytes, stream) + + for vr_frame_from_mem in video_reader_from_mem: + vr_frames_mem.append(vr_frame_from_mem["data"]) + vr_pts_mem.append(vr_frame_from_mem["pts"]) + + # same number of frames + assert len(vr_frames) == len(vr_frames_mem) + 
assert len(vr_pts) == len(vr_pts_mem) + + # compare the frames and ptss + for i in range(len(vr_frames)): + assert vr_pts[i] == vr_pts_mem[i] + mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) + # on average the difference is very small and caused + # by decoding (around 1%) + # TODO: asses empirically how to set this? atm it's 1% + # averaged over all frames + assert mean_delta.item() < 2.55 + + del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem + @pytest.mark.parametrize("test_video,config", test_videos.items()) def test_metadata(self, test_video, config): """ diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h index dac6293d3..502e5762e 100644 --- a/torchvision/csrc/io/decoder/defs.h +++ b/torchvision/csrc/io/decoder/defs.h @@ -165,7 +165,7 @@ struct MediaFormat { struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize - std::string uri; + std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp index 38b350145..d8b36a35a 100644 --- a/torchvision/csrc/io/video/video.cpp +++ b/torchvision/csrc/io/video/video.cpp @@ -156,14 +156,34 @@ void Video::_getDecoderParams( } // _get decoder params -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); +void Video::initFromFile( + std::string videoPath, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + params.uri = videoPath; + _init(stream, numThreads); +} + +void Video::initFromMemory( + torch::Tensor videoTensor, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + callback = MemoryBuffer::getCallback( + videoTensor.data_ptr(), videoTensor.size(0)); + _init(stream, numThreads); +} + +void Video::_init(std::string stream, int64_t numThreads) { // set number of threads global numThreads_ = numThreads; // parse stream information current_stream = _parseStream(stream); // note that in the initial call we want to get all streams - Video::_getDecoderParams( + _getDecoderParams( 0, // video start 0, // headerOnly std::get<0>(current_stream), // stream info - remove that @@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { std::string logMessage, logType; - // TODO: add read from memory option - params.uri = videoPath; - logType = "file"; - logMessage = videoPath; - // locals std::vector audioFPS, videoFPS; std::vector audioDuration, videoDuration, ccDuration, subsDuration; @@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { c10::Dict> subsMetadata; // callback and metadata defined in struct - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); if (succeeded) { for (const auto& header : metadata) { double fps = double(header.fps); @@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("cc", ccMetadata); - succeeded = Video::setCurrentStream(stream); + 
succeeded = setCurrentStream(stream); LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; if (std::get<1>(current_stream) != -1) { LOG(INFO) << "Stream index set to " << std::get<1>(current_stream) << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; } +} + +Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { + C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); + if (!videoPath.empty()) { + initFromFile(videoPath, stream, numThreads); + } } // video bool Video::setCurrentStream(std::string stream = "video") { + TORCH_CHECK(initialized, "Video object has to be initialized first"); if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { current_stream = _parseStream(stream); } @@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ); // callback and metadata defined in Video.h - return (decoder.init(params, std::move(callback), &metadata)); + DecoderInCallback tmp_callback = callback; + return (decoder.init(params, std::move(tmp_callback), &metadata)); } std::tuple Video::getCurrentStream() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return current_stream; } c10::Dict>> Video:: getStreamMetadata() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return streamsMetadata; } void Video::Seek(double ts, bool fastSeek = false) { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // initialize the class variables used for seeking and retrurn _getDecoderParams( ts, // video start @@ -282,11 +310,14 @@ void Video::Seek(double ts, bool fastSeek = false) { ); // callback and metadata defined in Video.h - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); + LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; } std::tuple Video::Next() { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // if failing to decode simply return a null tensor (note, should we // raise an exeption?) double frame_pts_s; @@ -345,6 +376,8 @@ std::tuple Video::Next() { static auto registerVideo = torch::class_
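
On the Python side, the from-memory path added above is exercised by the new test_frame_reading_mem_vs_file test earlier in this patch. A minimal usage sketch mirroring that test, assuming the usual torchvision.io.VideoReader entry point and a torchvision build with the video API enabled (the file name is a placeholder):

    import torch
    from torchvision.io import VideoReader

    with open("example.mp4", "rb") as f:           # placeholder path
        raw = f.read()                             # raw container bytes

    reader = VideoReader(raw, "video")             # decode from memory instead of a file path
    frames = [frame["data"] for frame in reader]   # each item is {"data": Tensor, "pts": float}
    print(torch.stack(frames).shape)               # (T, C, H, W)
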