From e89f639fa99a05607eaaed1832adc97b5095553b Mon Sep 17 00:00:00 2001 From: Robert Perrotta <104582251+robert-perrotta@users.noreply.github.com> Date: Tue, 4 Oct 2022 04:08:37 -0400 Subject: [PATCH 001/624] Fix error message typo (#6682) --- torchvision/models/detection/roi_heads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/roi_heads.py b/torchvision/models/detection/roi_heads.py index 18a6782a0..38dd7d4cf 100644 --- a/torchvision/models/detection/roi_heads.py +++ b/torchvision/models/detection/roi_heads.py @@ -787,7 +787,7 @@ class RoIHeads(nn.Module): mask_proposals = [p["boxes"] for p in result] if self.training: if matched_idxs is None: - raise ValueError("if in trainning, matched_idxs should not be None") + raise ValueError("if in training, matched_idxs should not be None") # during training, only focus on positive boxes num_images = len(proposals) -- GitLab From 344ccc05b2ebf643af319b6f648128e18807c892 Mon Sep 17 00:00:00 2001 From: Robert Perrotta <104582251+robert-perrotta@users.noreply.github.com> Date: Tue, 4 Oct 2022 04:12:07 -0400 Subject: [PATCH 002/624] Fix missing f-string prefix in error message (#6684) Co-authored-by: Nicolas Hug --- torchvision/models/detection/roi_heads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/detection/roi_heads.py b/torchvision/models/detection/roi_heads.py index 38dd7d4cf..f6347a0d9 100644 --- a/torchvision/models/detection/roi_heads.py +++ b/torchvision/models/detection/roi_heads.py @@ -746,7 +746,7 @@ class RoIHeads(nn.Module): if not t["boxes"].dtype in floating_point_types: raise TypeError(f"target boxes must of float type, instead got {t['boxes'].dtype}") if not t["labels"].dtype == torch.int64: - raise TypeError("target labels must of int64 type, instead got {t['labels'].dtype}") + raise TypeError(f"target labels must of int64 type, instead got {t['labels'].dtype}") if self.has_keypoint(): if not t["keypoints"].dtype == torch.float32: raise TypeError(f"target keypoints must of float type, instead got {t['keypoints'].dtype}") -- GitLab From 45f87fa3a6bb2aec9885c3a0963513ea2bba7532 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Tue, 4 Oct 2022 03:16:57 -0700 Subject: [PATCH 003/624] [ONNX] Support exporting RoiAlign align=True to ONNX with opset 16 (#6685) * Support exporting RoiAlign align=True to ONNX with opset 16 * lint: ufmt Co-authored-by: Vasilis Vryniotis --- test/test_onnx.py | 27 +++++++----- torchvision/ops/_register_onnx_ops.py | 63 +++++++++++++++++++-------- 2 files changed, 61 insertions(+), 29 deletions(-) diff --git a/test/test_onnx.py b/test/test_onnx.py index d5dae64b4..b6f5481ed 100644 --- a/test/test_onnx.py +++ b/test/test_onnx.py @@ -1,6 +1,6 @@ import io from collections import OrderedDict -from typing import List, Tuple +from typing import List, Optional, Tuple import pytest import torch @@ -11,7 +11,7 @@ from torchvision.models.detection.image_list import ImageList from torchvision.models.detection.roi_heads import RoIHeads from torchvision.models.detection.rpn import AnchorGenerator, RegionProposalNetwork, RPNHead from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.ops._register_onnx_ops import _onnx_opset_version +from torchvision.ops import _register_onnx_ops # In environments without onnxruntime we prefer to # invoke all tests in the repo and have this one skipped rather than fail. 
@@ -32,7 +32,11 @@ class TestONNXExporter: dynamic_axes=None, output_names=None, input_names=None, + opset_version: Optional[int] = None, ): + if opset_version is None: + opset_version = _register_onnx_ops.base_onnx_opset_version + model.eval() onnx_io = io.BytesIO() @@ -46,10 +50,11 @@ class TestONNXExporter: torch_onnx_input, onnx_io, do_constant_folding=do_constant_folding, - opset_version=_onnx_opset_version, + opset_version=opset_version, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, + verbose=True, ) # validate the exported model with onnx runtime for test_inputs in inputs_list: @@ -140,39 +145,39 @@ class TestONNXExporter: model = ops.RoIAlign((5, 5), 1, -1) self.run_model(model, [(x, single_roi)]) - @pytest.mark.skip(reason="ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16.") def test_roi_align_aligned(self): + supported_onnx_version = _register_onnx_ops._onnx_opset_version_16 x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 0.5, 3, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1.8, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, 0, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, -1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) - @pytest.mark.skip(reason="Issue in exporting ROIAlign with aligned = True for malformed boxes") def test_roi_align_malformed_boxes(self): + supported_onnx_version = _register_onnx_ops._onnx_opset_version_16 x = torch.randn(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 2, 0.3, 1.5, 1.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) def test_roi_pool(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) diff --git a/torchvision/ops/_register_onnx_ops.py b/torchvision/ops/_register_onnx_ops.py index 629c19c16..eaea0b900 100644 --- a/torchvision/ops/_register_onnx_ops.py +++ b/torchvision/ops/_register_onnx_ops.py @@ -3,7 +3,9 @@ import warnings import torch -_onnx_opset_version = 11 +_onnx_opset_version_11 = 11 +_onnx_opset_version_16 = 16 +base_onnx_opset_version = _onnx_opset_version_11 def _register_custom_op(): @@ -20,32 +22,56 @@ def _register_custom_op(): nms_out = g.op("NonMaxSuppression", boxes, 
scores, max_output_per_class, iou_threshold) return squeeze(g, select(g, nms_out, 1, g.op("Constant", value_t=torch.tensor([2], dtype=torch.long))), 1) - @parse_args("v", "v", "f", "i", "i", "i", "i") - def roi_align(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): - batch_indices = _cast_Long( + def _process_batch_indices_for_roi_align(g, rois): + return _cast_Long( g, squeeze(g, select(g, rois, 1, g.op("Constant", value_t=torch.tensor([0], dtype=torch.long))), 1), False ) - rois = select(g, rois, 1, g.op("Constant", value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long))) - # TODO: Remove this warning after ONNX opset 16 is supported. - if aligned: - warnings.warn( - "ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16. " - "The workaround is that the user need apply the patch " - "https://github.com/microsoft/onnxruntime/pull/8564 " - "and build ONNXRuntime from source." - ) - # ONNX doesn't support negative sampling_ratio + def _process_rois_for_roi_align(g, rois): + return select(g, rois, 1, g.op("Constant", value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long))) + + def _process_sampling_ratio_for_roi_align(g, sampling_ratio: int): if sampling_ratio < 0: warnings.warn( - "ONNX doesn't support negative sampling ratio, therefore is set to 0 in order to be exported." + "ONNX export for RoIAlign with a non-zero sampling_ratio is not supported. " + "The model will be exported with a sampling_ratio of 0." ) sampling_ratio = 0 + return sampling_ratio + + @parse_args("v", "v", "f", "i", "i", "i", "i") + def roi_align_opset11(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + batch_indices = _process_batch_indices_for_roi_align(g, rois) + rois = _process_rois_for_roi_align(g, rois) + if aligned: + warnings.warn( + "ROIAlign with aligned=True is not supported in ONNX, but is supported in opset 16. " + "Please export with opset 16 or higher to use aligned=False." 
+ ) + sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio) + return g.op( + "RoiAlign", + input, + rois, + batch_indices, + spatial_scale_f=spatial_scale, + output_height_i=pooled_height, + output_width_i=pooled_width, + sampling_ratio_i=sampling_ratio, + ) + + @parse_args("v", "v", "f", "i", "i", "i", "i") + def roi_align_opset16(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + batch_indices = _process_batch_indices_for_roi_align(g, rois) + rois = _process_rois_for_roi_align(g, rois) + coordinate_transformation_mode = "half_pixel" if aligned else "output_half_pixel" + sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio) return g.op( "RoiAlign", input, rois, batch_indices, + coordinate_transformation_mode_s=coordinate_transformation_mode, spatial_scale_f=spatial_scale, output_height_i=pooled_height, output_width_i=pooled_width, @@ -61,6 +87,7 @@ def _register_custom_op(): from torch.onnx import register_custom_op_symbolic - register_custom_op_symbolic("torchvision::nms", symbolic_multi_label_nms, _onnx_opset_version) - register_custom_op_symbolic("torchvision::roi_align", roi_align, _onnx_opset_version) - register_custom_op_symbolic("torchvision::roi_pool", roi_pool, _onnx_opset_version) + register_custom_op_symbolic("torchvision::nms", symbolic_multi_label_nms, _onnx_opset_version_11) + register_custom_op_symbolic("torchvision::roi_align", roi_align_opset11, _onnx_opset_version_11) + register_custom_op_symbolic("torchvision::roi_align", roi_align_opset16, _onnx_opset_version_16) + register_custom_op_symbolic("torchvision::roi_pool", roi_pool, _onnx_opset_version_11) -- GitLab From 969a7b532ec8946efc78513e12a81dcc78289444 Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Tue, 4 Oct 2022 12:22:27 +0100 Subject: [PATCH 004/624] Bump main version to 0.15 (#6691) * Bump main version to 0.15 * Update table on README.rst * Revert the readme update --- android/gradle.properties | 2 +- version.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/android/gradle.properties b/android/gradle.properties index 1b6b275f6..8204b73b0 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.14.0-SNAPSHOT +VERSION_NAME=0.15.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch SONATYPE_STAGING_PROFILE=orgpytorch diff --git a/version.txt b/version.txt index 56f78043a..b4f7ccce2 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.0a0 +0.15.0a0 -- GitLab From b482d896f448cc44fdadb030391ac12723a81546 Mon Sep 17 00:00:00 2001 From: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com> Date: Tue, 4 Oct 2022 04:23:05 -0700 Subject: [PATCH 005/624] Use real image in test_detection_model (#6658) * update test_models.py * update tests * fix linting * fix linting * add comment * Trigger CI Co-authored-by: YosuaMichael --- ...er.test_fasterrcnn_resnet50_fpn_expect.pkl | Bin 3939 -> 4395 bytes ...test_fasterrcnn_resnet50_fpn_v2_expect.pkl | Bin 3939 -> 4410 bytes ...elTester.test_fcos_resnet50_fpn_expect.pkl | Bin 9571 -> 3405 bytes ....test_keypointrcnn_resnet50_fpn_expect.pkl | Bin 2199 -> 3367 bytes ...ster.test_maskrcnn_resnet50_fpn_expect.pkl | Bin 4507 -> 4965 bytes ...r.test_maskrcnn_resnet50_fpn_v2_expect.pkl | Bin 4507 -> 4986 bytes ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 9571 -> 9382 bytes ....test_retinanet_resnet50_fpn_v2_expect.pkl | Bin 9571 -> 9461 bytes test/test_models.py | 55 ++++++++++++++++-- 9 
files changed, 49 insertions(+), 6 deletions(-)

 [GIT binary patch data for the nine updated test/expect/ModelTester.*_expect.pkl files and the accompanying test/test_models.py hunk omitted]

-- GitLab

Date: Tue, 4 Oct
2022 17:48:11 +0100 Subject: [PATCH 006/624] fix bug in output format for pyav (#6672) * fix bug in output format for pyav * add read from memory with constructor overload * Revert "add read from memory with constructor overload" This reverts commit 14cbbab239165be05096fd6cbb88cb0448502436. * run ufmt --- torchvision/io/video.py | 132 ++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 65 deletions(-) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index ceb20fe52..002fde998 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -273,72 +273,74 @@ def read_video( raise RuntimeError(f"File not found: {filename}") if get_video_backend() != "pyav": - return _video_opt._read_video(filename, start_pts, end_pts, pts_unit) - - _check_av_available() - - if end_pts is None: - end_pts = float("inf") - - if end_pts < start_pts: - raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}") - - info = {} - video_frames = [] - audio_frames = [] - audio_timebase = _video_opt.default_timebase - - try: - with av.open(filename, metadata_errors="ignore") as container: - if container.streams.audio: - audio_timebase = container.streams.audio[0].time_base - if container.streams.video: - video_frames = _read_from_stream( - container, - start_pts, - end_pts, - pts_unit, - container.streams.video[0], - {"video": 0}, - ) - video_fps = container.streams.video[0].average_rate - # guard against potentially corrupted files - if video_fps is not None: - info["video_fps"] = float(video_fps) - - if container.streams.audio: - audio_frames = _read_from_stream( - container, - start_pts, - end_pts, - pts_unit, - container.streams.audio[0], - {"audio": 0}, - ) - info["audio_fps"] = container.streams.audio[0].rate - - except av.AVError: - # TODO raise a warning? 
- pass - - vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames] - aframes_list = [frame.to_ndarray() for frame in audio_frames] - - if vframes_list: - vframes = torch.as_tensor(np.stack(vframes_list)) - else: - vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8) - - if aframes_list: - aframes = np.concatenate(aframes_list, 1) - aframes = torch.as_tensor(aframes) - if pts_unit == "sec": - start_pts = int(math.floor(start_pts * (1 / audio_timebase))) - if end_pts != float("inf"): - end_pts = int(math.ceil(end_pts * (1 / audio_timebase))) - aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts) + vframes, aframes, info = _video_opt._read_video(filename, start_pts, end_pts, pts_unit) else: - aframes = torch.empty((1, 0), dtype=torch.float32) + _check_av_available() + + if end_pts is None: + end_pts = float("inf") + + if end_pts < start_pts: + raise ValueError( + f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}" + ) + + info = {} + video_frames = [] + audio_frames = [] + audio_timebase = _video_opt.default_timebase + + try: + with av.open(filename, metadata_errors="ignore") as container: + if container.streams.audio: + audio_timebase = container.streams.audio[0].time_base + if container.streams.video: + video_frames = _read_from_stream( + container, + start_pts, + end_pts, + pts_unit, + container.streams.video[0], + {"video": 0}, + ) + video_fps = container.streams.video[0].average_rate + # guard against potentially corrupted files + if video_fps is not None: + info["video_fps"] = float(video_fps) + + if container.streams.audio: + audio_frames = _read_from_stream( + container, + start_pts, + end_pts, + pts_unit, + container.streams.audio[0], + {"audio": 0}, + ) + info["audio_fps"] = container.streams.audio[0].rate + + except av.AVError: + # TODO raise a warning? 
+ pass + + vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames] + aframes_list = [frame.to_ndarray() for frame in audio_frames] + + if vframes_list: + vframes = torch.as_tensor(np.stack(vframes_list)) + else: + vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8) + + if aframes_list: + aframes = np.concatenate(aframes_list, 1) + aframes = torch.as_tensor(aframes) + if pts_unit == "sec": + start_pts = int(math.floor(start_pts * (1 / audio_timebase))) + if end_pts != float("inf"): + end_pts = int(math.ceil(end_pts * (1 / audio_timebase))) + aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts) + else: + aframes = torch.empty((1, 0), dtype=torch.float32) if output_format == "TCHW": # [T,H,W,C] --> [T,C,H,W] -- GitLab From 3038cb28d9e9c8c2cebda310943721ec0dc9014e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 4 Oct 2022 20:05:51 +0200 Subject: [PATCH 007/624] fix example galleries in documentation (#6694) * exclude sphinx-gallery==0.11.0 * fix CSS * Update docs/requirements.txt --- docs/requirements.txt | 2 +- docs/source/_static/css/custom_torchvision.css | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 1ff0c8280..09a11359a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ matplotlib numpy sphinx-copybutton>=0.3.1 -sphinx-gallery>=0.9.0 +sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme diff --git a/docs/source/_static/css/custom_torchvision.css b/docs/source/_static/css/custom_torchvision.css index bdc4071c1..07346d7b0 100644 --- a/docs/source/_static/css/custom_torchvision.css +++ b/docs/source/_static/css/custom_torchvision.css @@ -21,3 +21,15 @@ article.pytorch-article .reference.download.internal, article.pytorch-article .s .table-weights p { margin-bottom: 0.2rem !important; } + +/* Fix for Sphinx gallery 0.11 +See https://github.com/sphinx-gallery/sphinx-gallery/issues/990 +*/ +article.pytorch-article .sphx-glr-thumbnails .sphx-glr-thumbcontainer { + width: unset; + margin-right: 0; + margin-left: 0; +} +article.pytorch-article div.section div.wy-table-responsive tbody td { + width: 50%; +} -- GitLab From 71885b0f255f2e76fd0a07e348bddbec6430a8d3 Mon Sep 17 00:00:00 2001 From: Karan Desai Date: Tue, 4 Oct 2022 17:58:25 -0400 Subject: [PATCH 008/624] Make CUB200 labels 0-indexed. (#6702) CUB200 dataset in `torchvision.prototype.datasets` module formed labels using file paths. This resulted in labels being 1-indexed (1-200) instead of 0-indexed (0-199). Similar issue occurred with Flowers102 (`torchvision.datasets` module, #5766). 
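
Minimal sketch of the parsing involved (illustrative only, not the prototype datapipe itself; the helper and paths below are hypothetical): the class-folder prefix in CUB-200 is 1-based, so the added `- 1` maps it onto the 0-based label range used by the other datasets.

    import pathlib

    def label_from_path(path: str) -> int:
        # Class folders are named like "001.Black_footed_Albatross"; the numeric
        # prefix is 1-based, so subtract 1 to get a 0-based label in [0, 199].
        folder = pathlib.Path(path).parent.name
        return int(folder.rsplit(".", 1)[0]) - 1

    # Hypothetical image paths, shown only to illustrate the off-by-one fix:
    print(label_from_path("CUB_200_2011/images/001.Black_footed_Albatross/img_0001.jpg"))  # 0
    print(label_from_path("CUB_200_2011/images/200.Common_Yellowthroat/img_0042.jpg"))     # 199
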
--- torchvision/prototype/datasets/_builtin/cub200.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index c07166a96..f1531615c 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -177,7 +177,10 @@ class CUB200(Dataset): return dict( prepare_ann_fn(anns_data, image.image_size), image=image, - label=Label(int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]), categories=self._categories), + label=Label( + int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1, + categories=self._categories, + ), ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: -- GitLab From 0e006a9fc9eae6d3cc13fceb3a205af29b06d703 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Tue, 4 Oct 2022 15:02:57 -0700 Subject: [PATCH 009/624] [ONNX] Rephrase ONNX RoiAlign warning for aligned=True (#6704) * Rephrase ONNX RoiAlign warning for aligned=True * add comma Co-authored-by: Vasilis Vryniotis --- torchvision/ops/_register_onnx_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/ops/_register_onnx_ops.py b/torchvision/ops/_register_onnx_ops.py index eaea0b900..8f9598e1f 100644 --- a/torchvision/ops/_register_onnx_ops.py +++ b/torchvision/ops/_register_onnx_ops.py @@ -45,8 +45,8 @@ def _register_custom_op(): rois = _process_rois_for_roi_align(g, rois) if aligned: warnings.warn( - "ROIAlign with aligned=True is not supported in ONNX, but is supported in opset 16. " - "Please export with opset 16 or higher to use aligned=False." + "ROIAlign with aligned=True is only supported in opset >= 16. " + "Please export with opset 16 or higher, or use aligned=False." 
) sampling_ratio = _process_sampling_ratio_for_roi_align(g, sampling_ratio) return g.op( -- GitLab From 4a99bae8ad28247520b3e3b179ed604317d5fdce Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Oct 2022 14:07:55 +0200 Subject: [PATCH 010/624] add dispatch tests for prototype transform dispatchers (#6631) --- test/prototype_transforms_dispatcher_infos.py | 122 ++++++++++++++++-- test/prototype_transforms_kernel_infos.py | 2 +- test/test_prototype_transforms_functional.py | 75 +++++++++++ 3 files changed, 189 insertions(+), 10 deletions(-) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index 99a9066be..a14d5eaf0 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -1,15 +1,27 @@ +import collections.abc import dataclasses + from collections import defaultdict -from typing import Callable, Dict, List, Sequence, Type +from typing import Callable, Dict, List, Optional, Sequence, Type import pytest import torchvision.prototype.transforms.functional as F -from prototype_transforms_kernel_infos import KERNEL_INFOS, Skip +from prototype_common_utils import BoundingBoxLoader +from prototype_transforms_kernel_infos import KERNEL_INFOS, KernelInfo, Skip from torchvision.prototype import features __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] -KERNEL_SAMPLE_INPUTS_FN_MAP = {info.kernel: info.sample_inputs_fn for info in KERNEL_INFOS} +KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} + + +@dataclasses.dataclass +class PILKernelInfo: + kernel: Callable + kernel_name: str = dataclasses.field(default=None) + + def __post_init__(self): + self.kernel_name = self.kernel_name or self.kernel.__name__ def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): @@ -28,21 +40,35 @@ def skip_integer_size_jit(name="size"): class DispatcherInfo: dispatcher: Callable kernels: Dict[Type, Callable] + kernel_infos: Dict[Type, KernelInfo] = dataclasses.field(default=None) + pil_kernel_info: Optional[PILKernelInfo] = None + method_name: str = dataclasses.field(default=None) skips: Sequence[Skip] = dataclasses.field(default_factory=list) _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) def __post_init__(self): + self.kernel_infos = {feature_type: KERNEL_INFO_MAP[kernel] for feature_type, kernel in self.kernels.items()} + self.method_name = self.method_name or self.dispatcher.__name__ skips_map = defaultdict(list) for skip in self.skips: skips_map[skip.test_name].append(skip) self._skips_map = dict(skips_map) - def sample_inputs(self, *types): - for type in types or self.kernels.keys(): - if type not in self.kernels: - raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") + def sample_inputs(self, *feature_types, filter_metadata=True): + for feature_type in feature_types or self.kernels.keys(): + if feature_type not in self.kernels: + raise pytest.UsageError(f"There is no kernel registered for type {feature_type.__name__}") + + sample_inputs = self.kernel_infos[feature_type].sample_inputs_fn() + if not filter_metadata: + yield from sample_inputs + else: + for args_kwargs in sample_inputs: + for attribute in feature_type.__annotations__.keys(): + if attribute in args_kwargs.kwargs: + del args_kwargs.kwargs[attribute] - yield from KERNEL_SAMPLE_INPUTS_FN_MAP[self.kernels[type]]() + yield args_kwargs def maybe_skip(self, *, test_name, args_kwargs, device): skips = 
self._skips_map.get(test_name) @@ -54,6 +80,31 @@ class DispatcherInfo: pytest.skip(skip.reason) +def fill_sequence_needs_broadcast(args_kwargs, device): + (image_loader, *_), kwargs = args_kwargs + try: + fill = kwargs["fill"] + except KeyError: + return False + + if not isinstance(fill, collections.abc.Sequence) or len(fill) > 1: + return False + + return image_loader.num_channels > 1 + + +skip_dispatch_pil_if_fill_sequence_needs_broadcast = Skip( + "test_dispatch_pil", + condition=fill_sequence_needs_broadcast, + reason="PIL kernel doesn't support sequences of length 1 if the number of channels is larger.", +) + +skip_dispatch_feature = Skip( + "test_dispatch_feature", + reason="Dispatcher doesn't support arbitrary feature dispatch.", +) + + DISPATCHER_INFOS = [ DispatcherInfo( F.horizontal_flip, @@ -62,6 +113,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.horizontal_flip_bounding_box, features.Mask: F.horizontal_flip_mask, }, + pil_kernel_info=PILKernelInfo(F.horizontal_flip_image_pil, kernel_name="horizontal_flip_image_pil"), ), DispatcherInfo( F.resize, @@ -70,6 +122,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.resize_bounding_box, features.Mask: F.resize_mask, }, + pil_kernel_info=PILKernelInfo(F.resize_image_pil), skips=[ skip_integer_size_jit(), ], @@ -81,7 +134,11 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.affine_bounding_box, features.Mask: F.affine_mask, }, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], + pil_kernel_info=PILKernelInfo(F.affine_image_pil), + skips=[ + skip_dispatch_pil_if_fill_sequence_needs_broadcast, + skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), + ], ), DispatcherInfo( F.vertical_flip, @@ -90,6 +147,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.vertical_flip_bounding_box, features.Mask: F.vertical_flip_mask, }, + pil_kernel_info=PILKernelInfo(F.vertical_flip_image_pil, kernel_name="vertical_flip_image_pil"), ), DispatcherInfo( F.rotate, @@ -98,6 +156,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.rotate_bounding_box, features.Mask: F.rotate_mask, }, + pil_kernel_info=PILKernelInfo(F.rotate_image_pil), ), DispatcherInfo( F.crop, @@ -106,6 +165,17 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.crop_bounding_box, features.Mask: F.crop_mask, }, + pil_kernel_info=PILKernelInfo(F.crop_image_pil, kernel_name="crop_image_pil"), + skips=[ + Skip( + "test_dispatch_feature", + condition=lambda args_kwargs, device: isinstance(args_kwargs.args[0], BoundingBoxLoader), + reason=( + "F.crop expects 4 coordinates as input, but bounding box sample inputs only generate two " + "since that is sufficient for the kernel." 
+ ), + ) + ], ), DispatcherInfo( F.resized_crop, @@ -114,6 +184,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.resized_crop_bounding_box, features.Mask: F.resized_crop_mask, }, + pil_kernel_info=PILKernelInfo(F.resized_crop_image_pil), ), DispatcherInfo( F.pad, @@ -122,6 +193,10 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.pad_bounding_box, features.Mask: F.pad_mask, }, + skips=[ + skip_dispatch_pil_if_fill_sequence_needs_broadcast, + ], + pil_kernel_info=PILKernelInfo(F.pad_image_pil, kernel_name="pad_image_pil"), ), DispatcherInfo( F.perspective, @@ -130,6 +205,10 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.perspective_bounding_box, features.Mask: F.perspective_mask, }, + skips=[ + skip_dispatch_pil_if_fill_sequence_needs_broadcast, + ], + pil_kernel_info=PILKernelInfo(F.perspective_image_pil), ), DispatcherInfo( F.elastic, @@ -138,6 +217,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.elastic_bounding_box, features.Mask: F.elastic_mask, }, + pil_kernel_info=PILKernelInfo(F.elastic_image_pil), ), DispatcherInfo( F.center_crop, @@ -146,6 +226,7 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.center_crop_bounding_box, features.Mask: F.center_crop_mask, }, + pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), skips=[ skip_integer_size_jit("output_size"), ], @@ -155,6 +236,7 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.gaussian_blur_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.gaussian_blur_image_pil), skips=[ skip_python_scalar_arg_jit("kernel_size"), skip_python_scalar_arg_jit("sigma"), @@ -165,80 +247,97 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.equalize_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.equalize_image_pil, kernel_name="equalize_image_pil"), ), DispatcherInfo( F.invert, kernels={ features.Image: F.invert_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.invert_image_pil, kernel_name="invert_image_pil"), ), DispatcherInfo( F.posterize, kernels={ features.Image: F.posterize_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.posterize_image_pil, kernel_name="posterize_image_pil"), ), DispatcherInfo( F.solarize, kernels={ features.Image: F.solarize_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.solarize_image_pil, kernel_name="solarize_image_pil"), ), DispatcherInfo( F.autocontrast, kernels={ features.Image: F.autocontrast_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.autocontrast_image_pil, kernel_name="autocontrast_image_pil"), ), DispatcherInfo( F.adjust_sharpness, kernels={ features.Image: F.adjust_sharpness_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_sharpness_image_pil, kernel_name="adjust_sharpness_image_pil"), ), DispatcherInfo( F.erase, kernels={ features.Image: F.erase_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.erase_image_pil), + skips=[ + skip_dispatch_feature, + ], ), DispatcherInfo( F.adjust_brightness, kernels={ features.Image: F.adjust_brightness_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_brightness_image_pil, kernel_name="adjust_brightness_image_pil"), ), DispatcherInfo( F.adjust_contrast, kernels={ features.Image: F.adjust_contrast_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_contrast_image_pil, kernel_name="adjust_contrast_image_pil"), ), DispatcherInfo( F.adjust_gamma, kernels={ features.Image: F.adjust_gamma_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_gamma_image_pil, kernel_name="adjust_gamma_image_pil"), ), DispatcherInfo( F.adjust_hue, kernels={ features.Image: F.adjust_hue_image_tensor, }, + 
pil_kernel_info=PILKernelInfo(F.adjust_hue_image_pil, kernel_name="adjust_hue_image_pil"), ), DispatcherInfo( F.adjust_saturation, kernels={ features.Image: F.adjust_saturation_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.adjust_saturation_image_pil, kernel_name="adjust_saturation_image_pil"), ), DispatcherInfo( F.five_crop, kernels={ features.Image: F.five_crop_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.five_crop_image_pil), skips=[ skip_integer_size_jit(), + skip_dispatch_feature, ], ), DispatcherInfo( @@ -246,8 +345,10 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.ten_crop_image_tensor, }, + pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), skips=[ skip_integer_size_jit(), + skip_dispatch_feature, ], ), DispatcherInfo( @@ -255,5 +356,8 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.normalize_image_tensor, }, + skips=[ + skip_dispatch_feature, + ], ), ] diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 3f050ad8f..a047a2d57 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -33,7 +33,7 @@ class KernelInfo: sample_inputs_fn: Callable[[], Iterable[ArgsKwargs]] # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name: Optional[str] = None + kernel_name: str = dataclasses.field(default=None) # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also take # tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should happen # inside the function. It should return a tensor or to be more precise an object that can be compared to a diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index b2c830d5d..143a5cd22 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -174,6 +174,18 @@ class TestKernels: assert_close(actual, expected, check_dtype=False, **info.closeness_kwargs) +@pytest.fixture +def spy_on(mocker): + def make_spy(fn, *, module=None, name=None): + # TODO: we can probably get rid of the non-default modules and names if we eliminate aliasing + module = module or fn.__module__ + name = name or fn.__name__ + spy = mocker.patch(f"{module}.{name}", wraps=fn) + return spy + + return make_spy + + class TestDispatchers: @pytest.mark.parametrize( ("info", "args_kwargs"), @@ -211,6 +223,69 @@ class TestDispatchers: def test_scriptable(self, dispatcher): script(dispatcher) + @pytest.mark.parametrize( + ("info", "args_kwargs"), + [ + pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") + for info in DISPATCHER_INFOS + for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) + if features.Image in info.kernels + ], + ) + def test_dispatch_simple_tensor(self, info, args_kwargs, spy_on): + (image_feature, *other_args), kwargs = args_kwargs.load() + image_simple_tensor = torch.Tensor(image_feature) + + kernel_info = info.kernel_infos[features.Image] + spy = spy_on(kernel_info.kernel, module=info.dispatcher.__module__, name=kernel_info.kernel_name) + + info.dispatcher(image_simple_tensor, *other_args, **kwargs) + + spy.assert_called_once() + + @pytest.mark.parametrize( + ("info", "args_kwargs"), + [ + pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") + for info in 
DISPATCHER_INFOS + for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) + if features.Image in info.kernels and info.pil_kernel_info is not None + ], + ) + def test_dispatch_pil(self, info, args_kwargs, spy_on): + (image_feature, *other_args), kwargs = args_kwargs.load() + + if image_feature.ndim > 3: + pytest.skip("Input is batched") + + image_pil = F.to_image_pil(image_feature) + + pil_kernel_info = info.pil_kernel_info + spy = spy_on(pil_kernel_info.kernel, module=info.dispatcher.__module__, name=pil_kernel_info.kernel_name) + + info.dispatcher(image_pil, *other_args, **kwargs) + + spy.assert_called_once() + + @pytest.mark.parametrize( + ("info", "args_kwargs"), + [ + pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") + for info in DISPATCHER_INFOS + for idx, args_kwargs in enumerate(info.sample_inputs()) + ], + ) + def test_dispatch_feature(self, info, args_kwargs, spy_on): + (feature, *other_args), kwargs = args_kwargs.load() + + method = getattr(feature, info.method_name) + feature_type = type(feature) + spy = spy_on(method, module=feature_type.__module__, name=f"{feature_type.__name__}.{info.method_name}") + + info.dispatcher(feature, *other_args, **kwargs) + + spy.assert_called_once() + @pytest.mark.parametrize( ("alias", "target"), -- GitLab From a46c4f0ccdb67d94c2ffc8b68b52693533a7683c Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Wed, 5 Oct 2022 13:54:05 +0100 Subject: [PATCH 011/624] [bugfix] Fix the output format for VideoClips.subset (#6700) Co-authored-by: Vasilis Vryniotis --- torchvision/datasets/video_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index c4890ff44..b607def24 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -198,6 +198,7 @@ class VideoClips: _video_max_dimension=self._video_max_dimension, _audio_samples=self._audio_samples, _audio_channels=self._audio_channels, + output_format=self.output_format, ) @staticmethod -- GitLab From 46eae182b9a2ad3cb906294e51be2838c98b5073 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Oct 2022 16:46:30 +0200 Subject: [PATCH 012/624] use pytest markers instead of custom solution for prototype transforms functional tests (#6653) * use pytest markers instead of custom solution for prototype transforms functional tests * cleanup * cleanup * trigger CI --- test/prototype_transforms_dispatcher_infos.py | 139 +++++++++--------- test/prototype_transforms_kernel_infos.py | 132 ++++++++--------- test/test_prototype_transforms_functional.py | 131 +++++++++-------- 3 files changed, 201 insertions(+), 201 deletions(-) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index a14d5eaf0..11a4c35ae 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -2,12 +2,12 @@ import collections.abc import dataclasses from collections import defaultdict + from typing import Callable, Dict, List, Optional, Sequence, Type import pytest import torchvision.prototype.transforms.functional as F -from prototype_common_utils import BoundingBoxLoader -from prototype_transforms_kernel_infos import KERNEL_INFOS, KernelInfo, Skip +from prototype_transforms_kernel_infos import KERNEL_INFOS, TestMark from torchvision.prototype import features __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] @@ -24,35 +24,27 @@ class PILKernelInfo: self.kernel_name = self.kernel_name or 
self.kernel.__name__ -def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): - return Skip( - "test_scripted_smoke", - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs[name], (int, float)), - reason=reason, - ) - - -def skip_integer_size_jit(name="size"): - return skip_python_scalar_arg_jit(name, reason="Integer size is not supported when scripting.") - - @dataclasses.dataclass class DispatcherInfo: dispatcher: Callable kernels: Dict[Type, Callable] - kernel_infos: Dict[Type, KernelInfo] = dataclasses.field(default=None) pil_kernel_info: Optional[PILKernelInfo] = None method_name: str = dataclasses.field(default=None) - skips: Sequence[Skip] = dataclasses.field(default_factory=list) - _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) + test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) + _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) def __post_init__(self): self.kernel_infos = {feature_type: KERNEL_INFO_MAP[kernel] for feature_type, kernel in self.kernels.items()} self.method_name = self.method_name or self.dispatcher.__name__ - skips_map = defaultdict(list) - for skip in self.skips: - skips_map[skip.test_name].append(skip) - self._skips_map = dict(skips_map) + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] def sample_inputs(self, *feature_types, filter_metadata=True): for feature_type in feature_types or self.kernels.keys(): @@ -70,17 +62,27 @@ class DispatcherInfo: yield args_kwargs - def maybe_skip(self, *, test_name, args_kwargs, device): - skips = self._skips_map.get(test_name) - if not skips: - return - for skip in skips: - if skip.condition(args_kwargs, device): - pytest.skip(skip.reason) +def xfail_python_scalar_arg_jit(name, *, reason=None): + reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), + ) + +def xfail_integer_size_jit(name="size"): + return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") -def fill_sequence_needs_broadcast(args_kwargs, device): + +skip_dispatch_feature = TestMark( + ("TestDispatchers", "test_dispatch_feature"), + pytest.mark.skip(reason="Dispatcher doesn't support arbitrary feature dispatch."), +) + + +def fill_sequence_needs_broadcast(args_kwargs): (image_loader, *_), kwargs = args_kwargs try: fill = kwargs["fill"] @@ -93,15 +95,12 @@ def fill_sequence_needs_broadcast(args_kwargs, device): return image_loader.num_channels > 1 -skip_dispatch_pil_if_fill_sequence_needs_broadcast = Skip( - "test_dispatch_pil", +xfail_dispatch_pil_if_fill_sequence_needs_broadcast = TestMark( + ("TestDispatchers", "test_dispatch_pil"), + pytest.mark.xfail( + reason="PIL kernel doesn't support sequences of length 1 for `fill` if the number of color channels is larger." 
+ ), condition=fill_sequence_needs_broadcast, - reason="PIL kernel doesn't support sequences of length 1 if the number of channels is larger.", -) - -skip_dispatch_feature = Skip( - "test_dispatch_feature", - reason="Dispatcher doesn't support arbitrary feature dispatch.", ) @@ -123,8 +122,8 @@ DISPATCHER_INFOS = [ features.Mask: F.resize_mask, }, pil_kernel_info=PILKernelInfo(F.resize_image_pil), - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), DispatcherInfo( @@ -135,9 +134,9 @@ DISPATCHER_INFOS = [ features.Mask: F.affine_mask, }, pil_kernel_info=PILKernelInfo(F.affine_image_pil), - skips=[ - skip_dispatch_pil_if_fill_sequence_needs_broadcast, - skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), + test_marks=[ + xfail_dispatch_pil_if_fill_sequence_needs_broadcast, + xfail_python_scalar_arg_jit("shear"), ], ), DispatcherInfo( @@ -166,16 +165,6 @@ DISPATCHER_INFOS = [ features.Mask: F.crop_mask, }, pil_kernel_info=PILKernelInfo(F.crop_image_pil, kernel_name="crop_image_pil"), - skips=[ - Skip( - "test_dispatch_feature", - condition=lambda args_kwargs, device: isinstance(args_kwargs.args[0], BoundingBoxLoader), - reason=( - "F.crop expects 4 coordinates as input, but bounding box sample inputs only generate two " - "since that is sufficient for the kernel." - ), - ) - ], ), DispatcherInfo( F.resized_crop, @@ -193,10 +182,20 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.pad_bounding_box, features.Mask: F.pad_mask, }, - skips=[ - skip_dispatch_pil_if_fill_sequence_needs_broadcast, - ], pil_kernel_info=PILKernelInfo(F.pad_image_pil, kernel_name="pad_image_pil"), + test_marks=[ + TestMark( + ("TestDispatchers", "test_dispatch_pil"), + pytest.mark.xfail( + reason=( + "PIL kernel doesn't support sequences of length 1 for argument `fill` and " + "`padding_mode='constant'`, if the number of color channels is larger." 
+ ) + ), + condition=lambda args_kwargs: fill_sequence_needs_broadcast(args_kwargs) + and args_kwargs.kwargs.get("padding_mode", "constant") == "constant", + ) + ], ), DispatcherInfo( F.perspective, @@ -205,10 +204,10 @@ DISPATCHER_INFOS = [ features.BoundingBox: F.perspective_bounding_box, features.Mask: F.perspective_mask, }, - skips=[ - skip_dispatch_pil_if_fill_sequence_needs_broadcast, - ], pil_kernel_info=PILKernelInfo(F.perspective_image_pil), + test_marks=[ + xfail_dispatch_pil_if_fill_sequence_needs_broadcast, + ], ), DispatcherInfo( F.elastic, @@ -227,8 +226,8 @@ DISPATCHER_INFOS = [ features.Mask: F.center_crop_mask, }, pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), DispatcherInfo( @@ -237,9 +236,9 @@ DISPATCHER_INFOS = [ features.Image: F.gaussian_blur_image_tensor, }, pil_kernel_info=PILKernelInfo(F.gaussian_blur_image_pil), - skips=[ - skip_python_scalar_arg_jit("kernel_size"), - skip_python_scalar_arg_jit("sigma"), + test_marks=[ + xfail_python_scalar_arg_jit("kernel_size"), + xfail_python_scalar_arg_jit("sigma"), ], ), DispatcherInfo( @@ -290,7 +289,7 @@ DISPATCHER_INFOS = [ features.Image: F.erase_image_tensor, }, pil_kernel_info=PILKernelInfo(F.erase_image_pil), - skips=[ + test_marks=[ skip_dispatch_feature, ], ), @@ -335,8 +334,8 @@ DISPATCHER_INFOS = [ features.Image: F.five_crop_image_tensor, }, pil_kernel_info=PILKernelInfo(F.five_crop_image_pil), - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), skip_dispatch_feature, ], ), @@ -345,18 +344,18 @@ DISPATCHER_INFOS = [ kernels={ features.Image: F.ten_crop_image_tensor, }, - pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), skip_dispatch_feature, ], + pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), ), DispatcherInfo( F.normalize, kernels={ features.Image: F.normalize_image_tensor, }, - skips=[ + test_marks=[ skip_dispatch_feature, ], ), diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index a047a2d57..2e02989b4 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -3,13 +3,15 @@ import functools import itertools import math from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple import numpy as np import pytest import torch.testing import torchvision.ops import torchvision.prototype.transforms.functional as F + +from _pytest.mark.structures import MarkDecorator from datasets_utils import combinations_grid from prototype_common_utils import ArgsKwargs, make_bounding_box_loaders, make_image_loaders, make_mask_loaders from torchvision.prototype import features @@ -18,11 +20,14 @@ from torchvision.transforms.functional_tensor import _max_value as get_max_value __all__ = ["KernelInfo", "KERNEL_INFOS"] +TestID = Tuple[Optional[str], str] + + @dataclasses.dataclass -class Skip: - test_name: str - reason: str - condition: Callable[[ArgsKwargs, str], bool] = lambda args_kwargs, device: True +class TestMark: + test_id: TestID + mark: MarkDecorator + condition: Callable[[ArgsKwargs], bool] = lambda args_kwargs: True @dataclasses.dataclass @@ -44,26 +49,22 @@ class KernelInfo: reference_inputs_fn: Optional[Callable[[], Iterable[ArgsKwargs]]] = None # Additional 
parameters, e.g. `rtol=1e-3`, passed to `assert_close`. closeness_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) - skips: Sequence[Skip] = dataclasses.field(default_factory=list) - _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) + test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) + _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) def __post_init__(self): self.kernel_name = self.kernel_name or self.kernel.__name__ self.reference_inputs_fn = self.reference_inputs_fn or self.sample_inputs_fn - skips_map = defaultdict(list) - for skip in self.skips: - skips_map[skip.test_name].append(skip) - self._skips_map = dict(skips_map) + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) - def maybe_skip(self, *, test_name, args_kwargs, device): - skips = self._skips_map.get(test_name) - if not skips: - return - - for skip in skips: - if skip.condition(args_kwargs, device): - pytest.skip(skip.reason) + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -87,16 +88,27 @@ def pil_reference_wrapper(pil_kernel): return wrapper -def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): - return Skip( - "test_scripted_vs_eager", - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs[name], (int, float)), - reason=reason, +def mark_framework_limitation(test_id, reason): + # The purpose of this function is to have a single entry point for skip marks that are only there, because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we are wasting CI resources for no reason for most of the time. 
+ return TestMark(test_id, pytest.mark.skip(reason=reason)) + + +def xfail_python_scalar_arg_jit(name, *, reason=None): + reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" + return TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), ) -def skip_integer_size_jit(name="size"): - return skip_python_scalar_arg_jit(name, reason="Integer size is not supported when scripting.") +def xfail_integer_size_jit(name="size"): + return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") KERNEL_INFOS = [] @@ -151,8 +163,7 @@ KERNEL_INFOS.extend( def _get_resize_sizes(image_size): height, width = image_size length = max(image_size) - # FIXME: enable me when the kernels are fixed - # yield length + yield length yield [length] yield (length,) new_height = int(height * 0.75) @@ -236,15 +247,15 @@ KERNEL_INFOS.extend( reference_fn=reference_resize_image_tensor, reference_inputs_fn=reference_inputs_resize_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), KernelInfo( F.resize_bounding_box, sample_inputs_fn=sample_inputs_resize_bounding_box, - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), KernelInfo( @@ -253,8 +264,8 @@ KERNEL_INFOS.extend( reference_fn=reference_resize_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit(), + test_marks=[ + xfail_integer_size_jit(), ], ), ] @@ -436,16 +447,6 @@ def reference_inputs_resize_mask(): yield ArgsKwargs(mask_loader, **affine_kwargs) -# FIXME: @datumbox, remove this as soon as you have fixed the behavior in https://github.com/pytorch/vision/pull/6636 -def skip_scalar_shears(*test_names): - for test_name in test_names: - yield Skip( - test_name, - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs["shear"], (int, float)), - reason="The kernel is broken for a scalar `shear`", - ) - - KERNEL_INFOS.extend( [ KernelInfo( @@ -454,7 +455,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.affine_image_pil), reference_inputs_fn=reference_inputs_affine_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], + test_marks=[xfail_python_scalar_arg_jit("shear")], ), KernelInfo( F.affine_bounding_box, @@ -462,13 +463,8 @@ KERNEL_INFOS.extend( reference_fn=reference_affine_bounding_box, reference_inputs_fn=reference_inputs_affine_bounding_box, closeness_kwargs=dict(atol=1, rtol=0), - skips=[ - skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), - *skip_scalar_shears( - "test_batched_vs_single", - "test_no_inplace", - "test_dtype_and_device_consistency", - ), + test_marks=[ + xfail_python_scalar_arg_jit("shear"), ], ), KernelInfo( @@ -477,7 +473,7 @@ KERNEL_INFOS.extend( reference_fn=reference_affine_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], + test_marks=[xfail_python_scalar_arg_jit("shear")], ), ] ) @@ -1093,15 +1089,15 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.center_crop_image_pil), 
reference_inputs_fn=reference_inputs_center_crop_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), KernelInfo( F.center_crop_bounding_box, sample_inputs_fn=sample_inputs_center_crop_bounding_box, - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), KernelInfo( @@ -1110,8 +1106,8 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.center_crop_image_pil), reference_inputs_fn=reference_inputs_center_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit("output_size"), + test_marks=[ + xfail_integer_size_jit("output_size"), ], ), ] @@ -1138,9 +1134,9 @@ KERNEL_INFOS.append( F.gaussian_blur_image_tensor, sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_python_scalar_arg_jit("kernel_size"), - skip_python_scalar_arg_jit("sigma"), + test_marks=[ + xfail_python_scalar_arg_jit("kernel_size"), + xfail_python_scalar_arg_jit("sigma"), ], ) ) @@ -1551,9 +1547,9 @@ KERNEL_INFOS.extend( sample_inputs_fn=sample_inputs_five_crop_image_tensor, reference_fn=pil_reference_wrapper(F.five_crop_image_pil), reference_inputs_fn=reference_inputs_five_crop_image_tensor, - skips=[ - skip_integer_size_jit(), - Skip("test_batched_vs_single", reason="Custom batching needed for five_crop_image_tensor."), + test_marks=[ + xfail_integer_size_jit(), + mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), @@ -1562,9 +1558,9 @@ KERNEL_INFOS.extend( sample_inputs_fn=sample_inputs_ten_crop_image_tensor, reference_fn=pil_reference_wrapper(F.ten_crop_image_pil), reference_inputs_fn=reference_inputs_ten_crop_image_tensor, - skips=[ - skip_integer_size_jit(), - Skip("test_batched_vs_single", reason="Custom batching needed for ten_crop_image_tensor."), + test_marks=[ + xfail_integer_size_jit(), + mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 143a5cd22..a6523045c 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1,3 +1,4 @@ +import functools import math import os @@ -26,33 +27,60 @@ def script(fn): raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error -@pytest.fixture(autouse=True) -def maybe_skip(request): - # In case the test uses no parametrization or fixtures, the `callspec` attribute does not exist - try: - callspec = request.node.callspec - except AttributeError: - return +def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, name_fn=lambda info: str(info)): + if condition is None: - try: - info = callspec.params["info"] - args_kwargs = callspec.params["args_kwargs"] - except KeyError: - return + def condition(info): + return True - info.maybe_skip( - test_name=request.node.originalname, args_kwargs=args_kwargs, device=callspec.params.get("device", "cpu") - ) + def decorator(test_fn): + parts = test_fn.__qualname__.split(".") + if len(parts) == 1: + test_class_name = None + test_function_name = parts[0] + elif len(parts) == 2: + test_class_name, test_function_name = parts + else: 
+ raise pytest.UsageError("Unable to parse the test class and test name from test function") + test_id = (test_class_name, test_function_name) + + argnames = ("info", "args_kwargs") + argvalues = [] + for info in infos: + if not condition(info): + continue + + args_kwargs = list(args_kwargs_fn(info)) + name = name_fn(info) + idx_field_len = len(str(len(args_kwargs))) + + for idx, args_kwargs_ in enumerate(args_kwargs): + argvalues.append( + pytest.param( + info, + args_kwargs_, + marks=info.get_marks(test_id, args_kwargs_), + id=f"{name}-{idx:0{idx_field_len}}", + ) + ) + + return pytest.mark.parametrize(argnames, argvalues)(test_fn) + + return decorator class TestKernels: - sample_inputs = pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.kernel_name}-{idx}") - for info in KERNEL_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs_fn()) - ], + make_kernel_args_kwargs_parametrization = functools.partial( + make_args_kwargs_parametrization, name_fn=lambda info: info.kernel_name + ) + sample_inputs = kernel_sample_inputs = make_kernel_args_kwargs_parametrization( + KERNEL_INFOS, + args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), + ) + reference_inputs = make_kernel_args_kwargs_parametrization( + KERNEL_INFOS, + args_kwargs_fn=lambda info: info.reference_inputs_fn(), + condition=lambda info: info.reference_fn is not None, ) @sample_inputs @@ -156,15 +184,7 @@ class TestKernels: assert output.dtype == input.dtype assert output.device == input.device - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.kernel_name}-{idx}") - for info in KERNEL_INFOS - for idx, args_kwargs in enumerate(info.reference_inputs_fn()) - if info.reference_fn is not None - ], - ) + @reference_inputs def test_against_reference(self, info, args_kwargs): args, kwargs = args_kwargs.load("cpu") @@ -187,15 +207,16 @@ def spy_on(mocker): class TestDispatchers: - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels - ], + make_dispatcher_args_kwargs_parametrization = functools.partial( + make_args_kwargs_parametrization, name_fn=lambda info: info.dispatcher.__name__ ) + image_sample_inputs = kernel_sample_inputs = make_dispatcher_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(features.Image), + condition=lambda info: features.Image in info.kernels, + ) + + @image_sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_scripted_smoke(self, info, args_kwargs, device): dispatcher = script(info.dispatcher) @@ -223,15 +244,7 @@ class TestDispatchers: def test_scriptable(self, dispatcher): script(dispatcher) - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels - ], - ) + @image_sample_inputs def test_dispatch_simple_tensor(self, info, args_kwargs, spy_on): (image_feature, *other_args), kwargs = args_kwargs.load() image_simple_tensor = torch.Tensor(image_feature) @@ -243,14 +256,10 @@ class TestDispatchers: spy.assert_called_once() - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, 
args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels and info.pil_kernel_info is not None - ], + @make_dispatcher_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(features.Image), + condition=lambda info: info.pil_kernel_info is not None, ) def test_dispatch_pil(self, info, args_kwargs, spy_on): (image_feature, *other_args), kwargs = args_kwargs.load() @@ -267,13 +276,9 @@ class TestDispatchers: spy.assert_called_once() - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs()) - ], + @make_dispatcher_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(), ) def test_dispatch_feature(self, info, args_kwargs, spy_on): (feature, *other_args), kwargs = args_kwargs.load() -- GitLab From 96d1fecf282fa23883fe1953f44edd20c8a8658a Mon Sep 17 00:00:00 2001 From: Aditya Gandhamal <61016383+adityagandhamal@users.noreply.github.com> Date: Thu, 6 Oct 2022 00:42:37 +0530 Subject: [PATCH 013/624] Handle invalid reduction values (#6675) * Add ValueError * Add tests for ValueError * Add tests for ValueError * Add ValueError * Change to if/else * Ammend iou_fn tests * Move code excerpt * Format tests Co-authored-by: Philip Meier Co-authored-by: Vasilis Vryniotis --- test/test_ops.py | 22 ++++++++++++++++++++++ torchvision/ops/ciou_loss.py | 11 +++++++++-- torchvision/ops/diou_loss.py | 9 ++++++++- torchvision/ops/focal_loss.py | 11 +++++++++-- torchvision/ops/giou_loss.py | 10 ++++++++-- 5 files changed, 56 insertions(+), 7 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index b34fbe7f2..d76e57fae 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1394,6 +1394,11 @@ class TestGeneralizedBoxIouLoss: assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, device=device, reduction="sum") assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, device=device, reduction="mean") + # Test reduction value + # reduction value other than ["none", "mean", "sum"] should raise a ValueError + with pytest.raises(ValueError, match="Invalid"): + ops.generalized_box_iou_loss(box1s, box2s, reduction="xyz") + @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): @@ -1413,6 +1418,9 @@ class TestCompleteBoxIouLoss: assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") + with pytest.raises(ValueError, match="Invalid"): + ops.complete_box_iou_loss(box1s, box2s, reduction="xyz") + @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): @@ -1432,6 +1440,9 @@ class TestDistanceBoxIouLoss: assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") + with pytest.raises(ValueError, match="Invalid"): + ops.distance_box_iou_loss(box1s, box2s, reduction="xyz") + @pytest.mark.parametrize("device", cpu_and_gpu()) 
@pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_distance_iou_inputs(self, dtype, device): @@ -1554,6 +1565,17 @@ class TestFocalLoss: tol = 1e-3 if dtype is torch.half else 1e-5 torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol) + # Raise ValueError for anonymous reduction mode + @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) + def test_reduction_mode(self, device, dtype, reduction="xyz"): + if device == "cpu" and dtype is torch.half: + pytest.skip("Currently torch.half is not fully supported on cpu") + torch.random.manual_seed(0) + inputs, targets = self._generate_diverse_input_target_pair(device=device, dtype=dtype) + with pytest.raises(ValueError, match="Invalid"): + ops.sigmoid_focal_loss(inputs, targets, 0.25, 2, reduction) + class TestMasksToBoxes: def test_masks_box(self): diff --git a/torchvision/ops/ciou_loss.py b/torchvision/ops/ciou_loss.py index a9f20a5f4..75a1c4cb1 100644 --- a/torchvision/ops/ciou_loss.py +++ b/torchvision/ops/ciou_loss.py @@ -63,9 +63,16 @@ def complete_box_iou_loss( alpha = v / (1 - iou + v + eps) loss = diou_loss + alpha * v - if reduction == "mean": + + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() - + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss diff --git a/torchvision/ops/diou_loss.py b/torchvision/ops/diou_loss.py index 2187aea4c..caf62bd2c 100644 --- a/torchvision/ops/diou_loss.py +++ b/torchvision/ops/diou_loss.py @@ -50,10 +50,17 @@ def distance_box_iou_loss( loss, _ = _diou_iou_loss(boxes1, boxes2, eps) - if reduction == "mean": + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss diff --git a/torchvision/ops/focal_loss.py b/torchvision/ops/focal_loss.py index c8cc9a8ac..08c282555 100644 --- a/torchvision/ops/focal_loss.py +++ b/torchvision/ops/focal_loss.py @@ -32,6 +32,7 @@ def sigmoid_focal_loss( Loss tensor with the reduction option applied. 
""" # Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(sigmoid_focal_loss) p = torch.sigmoid(inputs) @@ -43,9 +44,15 @@ def sigmoid_focal_loss( alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss - if reduction == "mean": + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() elif reduction == "sum": loss = loss.sum() - + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss diff --git a/torchvision/ops/giou_loss.py b/torchvision/ops/giou_loss.py index 0c555ec4f..03ef8e622 100644 --- a/torchvision/ops/giou_loss.py +++ b/torchvision/ops/giou_loss.py @@ -62,9 +62,15 @@ def generalized_box_iou_loss( loss = 1 - miouk - if reduction == "mean": + # Check reduction option and return loss accordingly + if reduction == "none": + pass + elif reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() - + else: + raise ValueError( + f"Invalid Value for arg 'reduction': '{reduction} \n Supported reduction modes: 'none', 'mean', 'sum'" + ) return loss -- GitLab From d020820edcc7c417fe9ca581da23b298ea6dfb46 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 6 Oct 2022 09:15:47 +0200 Subject: [PATCH 014/624] make pytest summary more concise (#6708) * make pytest summary more concise * fix comment --- pytest.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index 1dde465d3..a2f59ecec 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = - # show summary of all tests that did not pass - -ra + # show tests that (f)ailed, (E)rror, or (X)passed in the summary + -rfEX # Make tracebacks shorter --tb=native # enable all warnings -- GitLab From e3941afca3f380914397cb0e5665e5d616d440ae Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 6 Oct 2022 12:40:15 +0100 Subject: [PATCH 015/624] Remove unnecessary `ignore` directives to fix mypy (#6713) --- torchvision/prototype/features/_feature.py | 8 ++++---- torchvision/prototype/features/_image.py | 2 +- torchvision/prototype/transforms/functional/_misc.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index 9c0cece15..2da10be90 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -32,10 +32,10 @@ class _Feature(torch.Tensor): return ( torch.as_tensor( # type: ignore[return-value] data, - dtype=dtype, # type: ignore[arg-type] - device=device, # type: ignore[arg-type] + dtype=dtype, + device=device, ) - .as_subclass(cls) # type: ignore[arg-type] + .as_subclass(cls) .requires_grad_(requires_grad) ) @@ -115,7 +115,7 @@ class _Feature(torch.Tensor): # Inplace `func`'s, canonically identified with a trailing underscore in their name like `.add_(...)`, # will retain the input type. Thus, we need to unwrap here. 
if isinstance(output, cls): - return output.as_subclass(torch.Tensor) # type: ignore[arg-type] + return output.as_subclass(torch.Tensor) return output diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 21126c7f2..c953ae78c 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -71,7 +71,7 @@ class Image(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> Image: - data = torch.as_tensor(data, dtype=dtype, device=device) # type: ignore[arg-type] + data = torch.as_tensor(data, dtype=dtype, device=device) if data.ndim < 2: raise ValueError elif data.ndim == 2: diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 03ddf05ac..6f35781d4 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -16,7 +16,7 @@ def normalize( correct_type = isinstance(inpt, torch.Tensor) else: correct_type = features.is_simple_tensor(inpt) or isinstance(inpt, features.Image) - inpt = inpt.as_subclass(torch.Tensor) # type: ignore[arg-type] + inpt = inpt.as_subclass(torch.Tensor) if not correct_type: raise TypeError(f"img should be Tensor Image. Got {type(inpt)}") -- GitLab From 026991b152ffc3cbad8f49fe3f448ee66fe58803 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 6 Oct 2022 15:19:44 +0200 Subject: [PATCH 016/624] Reduce sample inputs for prototype transform kernels (#6714) * pad_image_tensor * pad_mask and pad_bounding_box * resize_{image_tensor, mask, bounding_box} * center_crop_{image_tensor, mask} * {five, ten}_crop_image_tensor * crop_{image_tensor, mask} * convert_color_space_image_tensor * affine_{image_tensor, mask, bounding_box} * rotate_{image_tensor, mask} * gaussian_blur_image_tensor * cleanup --- test/prototype_common_utils.py | 6 +- test/prototype_transforms_dispatcher_infos.py | 60 ++- test/prototype_transforms_kernel_infos.py | 389 ++++++++++++------ torchvision/transforms/functional_tensor.py | 7 +- 4 files changed, 309 insertions(+), 153 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index e9192f44f..333e11fb2 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -28,6 +28,7 @@ __all__ = [ "assert_close", "assert_equal", "ArgsKwargs", + "VALID_EXTRA_DIMS", "make_image_loaders", "make_image", "make_images", @@ -201,7 +202,10 @@ def _parse_image_size(size, *, name="size"): ) -DEFAULT_EXTRA_DIMS = ((), (0,), (4,), (2, 3), (5, 0), (0, 5)) +VALID_EXTRA_DIMS = ((), (4,), (2, 3)) +DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) + +DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) def from_loader(loader_fn): diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index 11a4c35ae..9678249aa 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -63,17 +63,40 @@ class DispatcherInfo: yield args_kwargs -def xfail_python_scalar_arg_jit(name, *, reason=None): +def xfail_jit_python_scalar_arg(name, *, reason=None): reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" return TestMark( ("TestDispatchers", "test_scripted_smoke"), pytest.mark.xfail(reason=reason), - condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), + condition=lambda args_kwargs: 
isinstance(args_kwargs.kwargs.get(name), (int, float)), ) -def xfail_integer_size_jit(name="size"): - return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") +def xfail_jit_integer_size(name="size"): + return xfail_jit_python_scalar_arg(name, reason=f"Integer `{name}` is not supported when scripting.") + + +def xfail_jit_tuple_instead_of_list(name, *, reason=None): + reason = reason or f"Passing a tuple instead of a list for `{name}` is not supported when scripting" + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), tuple), + ) + + +def is_list_of_ints(args_kwargs): + fill = args_kwargs.kwargs.get("fill") + return isinstance(fill, list) and any(isinstance(scalar_fill, int) for scalar_fill in fill) + + +def xfail_jit_list_of_ints(name, *, reason=None): + reason = reason or f"Passing a list of integers for `{name}` is not supported when scripting" + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=is_list_of_ints, + ) skip_dispatch_feature = TestMark( @@ -123,7 +146,7 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.resize_image_pil), test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), DispatcherInfo( @@ -136,7 +159,10 @@ DISPATCHER_INFOS = [ pil_kernel_info=PILKernelInfo(F.affine_image_pil), test_marks=[ xfail_dispatch_pil_if_fill_sequence_needs_broadcast, - xfail_python_scalar_arg_jit("shear"), + xfail_jit_python_scalar_arg("shear"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), ], ), DispatcherInfo( @@ -156,6 +182,11 @@ DISPATCHER_INFOS = [ features.Mask: F.rotate_mask, }, pil_kernel_info=PILKernelInfo(F.rotate_image_pil), + test_marks=[ + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), DispatcherInfo( F.crop, @@ -194,7 +225,12 @@ DISPATCHER_INFOS = [ ), condition=lambda args_kwargs: fill_sequence_needs_broadcast(args_kwargs) and args_kwargs.kwargs.get("padding_mode", "constant") == "constant", - ) + ), + xfail_jit_python_scalar_arg("padding"), + xfail_jit_tuple_instead_of_list("padding"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), ], ), DispatcherInfo( @@ -227,7 +263,7 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.center_crop_image_pil), test_marks=[ - xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), DispatcherInfo( @@ -237,8 +273,8 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.gaussian_blur_image_pil), test_marks=[ - xfail_python_scalar_arg_jit("kernel_size"), - xfail_python_scalar_arg_jit("sigma"), + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), ], ), DispatcherInfo( @@ -335,7 +371,7 @@ DISPATCHER_INFOS = [ }, pil_kernel_info=PILKernelInfo(F.five_crop_image_pil), test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), skip_dispatch_feature, ], ), @@ -345,7 +381,7 @@ DISPATCHER_INFOS = [ features.Image: F.ten_crop_image_tensor, }, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), skip_dispatch_feature, ], 
pil_kernel_info=PILKernelInfo(F.ten_crop_image_pil), diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 2e02989b4..c0e7bf5bf 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -12,8 +12,16 @@ import torchvision.ops import torchvision.prototype.transforms.functional as F from _pytest.mark.structures import MarkDecorator +from common_utils import cycle_over from datasets_utils import combinations_grid -from prototype_common_utils import ArgsKwargs, make_bounding_box_loaders, make_image_loaders, make_mask_loaders +from prototype_common_utils import ( + ArgsKwargs, + make_bounding_box_loaders, + make_image_loader, + make_image_loaders, + make_mask_loaders, + VALID_EXTRA_DIMS, +) from torchvision.prototype import features from torchvision.transforms.functional_tensor import _max_value as get_max_value @@ -98,17 +106,40 @@ def mark_framework_limitation(test_id, reason): return TestMark(test_id, pytest.mark.skip(reason=reason)) -def xfail_python_scalar_arg_jit(name, *, reason=None): +def xfail_jit_python_scalar_arg(name, *, reason=None): reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" return TestMark( ("TestKernels", "test_scripted_vs_eager"), pytest.mark.xfail(reason=reason), - condition=lambda args_kwargs: isinstance(args_kwargs.kwargs[name], (int, float)), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), + ) + + +def xfail_jit_integer_size(name="size"): + return xfail_jit_python_scalar_arg(name, reason=f"Integer `{name}` is not supported when scripting.") + + +def xfail_jit_tuple_instead_of_list(name, *, reason=None): + reason = reason or f"Passing a tuple instead of a list for `{name}` is not supported when scripting" + return TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.xfail(reason=reason), + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), tuple), ) -def xfail_integer_size_jit(name="size"): - return xfail_python_scalar_arg_jit(name, reason=f"Integer `{name}` is not supported when scripting.") +def is_list_of_ints(args_kwargs): + fill = args_kwargs.kwargs.get("fill") + return isinstance(fill, list) and any(isinstance(scalar_fill, int) for scalar_fill in fill) + + +def xfail_jit_list_of_ints(name, *, reason=None): + reason = reason or f"Passing a list of integers for `{name}` is not supported when scripting" + return TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.xfail(reason=reason), + condition=is_list_of_ints, + ) KERNEL_INFOS = [] @@ -173,15 +204,33 @@ def _get_resize_sizes(image_size): def sample_inputs_resize_image_tensor(): + for image_loader in make_image_loaders( + sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ): + for size in _get_resize_sizes(image_loader.image_size): + yield ArgsKwargs(image_loader, size=size) + for image_loader, interpolation in itertools.product( - make_image_loaders(dtypes=[torch.float32]), + make_image_loaders(sizes=["random"], color_spaces=[features.ColorSpace.RGB]), [ F.InterpolationMode.NEAREST, + F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC, ], ): - for size in _get_resize_sizes(image_loader.image_size): - yield ArgsKwargs(image_loader, size=size, interpolation=interpolation) + yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) + + # We have a speed hack in place for nearest interpolation and single 
channel images (grayscale) + for image_loader in make_image_loaders( + sizes=["random"], + color_spaces=[features.ColorSpace.GRAY], + extra_dims=VALID_EXTRA_DIMS, + ): + yield ArgsKwargs( + image_loader, size=[min(image_loader.image_size) + 1], interpolation=F.InterpolationMode.NEAREST + ) + + yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) @pil_reference_wrapper @@ -217,15 +266,14 @@ def reference_inputs_resize_image_tensor(): def sample_inputs_resize_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(formats=[features.BoundingBoxFormat.XYXY]): + for bounding_box_loader in make_bounding_box_loaders(): for size in _get_resize_sizes(bounding_box_loader.image_size): yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) def sample_inputs_resize_mask(): - for mask_loader in make_mask_loaders(dtypes=[torch.uint8]): - for size in _get_resize_sizes(mask_loader.shape[-2:]): - yield ArgsKwargs(mask_loader, size=size) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, size=[min(mask_loader.shape[-2:]) + 1]) @pil_reference_wrapper @@ -248,14 +296,14 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_resize_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), KernelInfo( F.resize_bounding_box, sample_inputs_fn=sample_inputs_resize_bounding_box, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), KernelInfo( @@ -265,7 +313,7 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), ], ), ] @@ -290,28 +338,51 @@ def _diversify_affine_kwargs_types(affine_kwargs): yield dict(affine_kwargs, shear=diverse_shear) +def _full_affine_params(**partial_params): + partial_params.setdefault("angle", 0.0) + partial_params.setdefault("translate", [0.0, 0.0]) + partial_params.setdefault("scale", 1.0) + partial_params.setdefault("shear", [0.0, 0.0]) + partial_params.setdefault("center", None) + return partial_params + + +_DIVERSE_AFFINE_PARAMS = [ + _full_affine_params(**{name: arg}) + for name, args in [ + ("angle", [1.0, 2]), + ("translate", [[1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]), + ("scale", [0.5]), + ("shear", [1.0, 2, [1.0], [2], (1.0,), (2,), [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]), + ("center", [None, [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]), + ] + for arg in args +] + + def sample_inputs_affine_image_tensor(): - for image_loader, interpolation_mode, center in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), + make_affine_image_loaders = functools.partial( + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ) + + for image_loader, affine_params in itertools.product(make_affine_image_loaders(), _DIVERSE_AFFINE_PARAMS): + yield ArgsKwargs(image_loader, **affine_params) + + for image_loader in make_affine_image_loaders(): + fills = [None, 0.5] + if image_loader.num_channels > 1: + fills.extend(vector_fill * image_loader.num_channels for vector_fill in [(0.5,), (1,), [0.5], [1]]) + for fill in fills: + yield ArgsKwargs(image_loader, **_full_affine_params(), fill=fill) + + for image_loader, interpolation in itertools.product( + make_affine_image_loaders(), [ F.InterpolationMode.NEAREST, 
F.InterpolationMode.BILINEAR, ], - [None, (0, 0)], - ): - for fill in [None, 128.0, 128, [12.0], [0.5] * image_loader.num_channels]: - yield ArgsKwargs( - image_loader, - interpolation=interpolation_mode, - center=center, - fill=fill, - **_AFFINE_KWARGS[0], - ) - - for image_loader, affine_kwargs in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) ): - yield ArgsKwargs(image_loader, **affine_kwargs) + yield ArgsKwargs(image_loader, **_full_affine_params(), fill=0) def reference_inputs_affine_image_tensor(): @@ -324,22 +395,14 @@ def reference_inputs_affine_image_tensor(): def sample_inputs_affine_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - **_AFFINE_KWARGS[0], - ) - - for bounding_box_loader, affine_kwargs in itertools.product( - make_bounding_box_loaders(), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) + for bounding_box_loader, affine_params in itertools.product( + make_bounding_box_loaders(formats=[features.BoundingBoxFormat.XYXY]), _DIVERSE_AFFINE_PARAMS ): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size, - **affine_kwargs, + **affine_params, ) @@ -423,16 +486,8 @@ def reference_inputs_affine_bounding_box(): def sample_inputs_affine_image_mask(): - for mask_loader, center in itertools.product( - make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]), - [None, (0, 0)], - ): - yield ArgsKwargs(mask_loader, center=center, **_AFFINE_KWARGS[0]) - - for mask_loader, affine_kwargs in itertools.product( - make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) - ): - yield ArgsKwargs(mask_loader, **affine_kwargs) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, **_full_affine_params()) @pil_reference_wrapper @@ -455,7 +510,12 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.affine_image_pil), reference_inputs_fn=reference_inputs_affine_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=[xfail_python_scalar_arg_jit("shear")], + test_marks=[ + xfail_jit_python_scalar_arg("shear"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), KernelInfo( F.affine_bounding_box, @@ -464,7 +524,7 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_affine_bounding_box, closeness_kwargs=dict(atol=1, rtol=0), test_marks=[ - xfail_python_scalar_arg_jit("shear"), + xfail_jit_python_scalar_arg("shear"), ], ), KernelInfo( @@ -473,7 +533,9 @@ KERNEL_INFOS.extend( reference_fn=reference_affine_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=[xfail_python_scalar_arg_jit("shear")], + test_marks=[ + xfail_jit_python_scalar_arg("shear"), + ], ), ] ) @@ -514,15 +576,21 @@ KERNEL_INFOS.append( def sample_inputs_convert_color_space_image_tensor(): - color_spaces = set(features.ColorSpace) - {features.ColorSpace.OTHER} - for image_loader in make_image_loaders(sizes=["random"], color_spaces=color_spaces, constant_alpha=True): - old_color_space = image_loader.color_space - for params in combinations_grid(new_color_space=color_spaces - 
{old_color_space}, copy=(True, False)): - yield ArgsKwargs(image_loader, old_color_space=old_color_space, **params) + color_spaces = list(set(features.ColorSpace) - {features.ColorSpace.OTHER}) + + for old_color_space, new_color_space in cycle_over(color_spaces): + for image_loader in make_image_loaders(sizes=["random"], color_spaces=[old_color_space], constant_alpha=True): + yield ArgsKwargs(image_loader, old_color_space=old_color_space, new_color_space=new_color_space) + + for color_space in color_spaces: + for image_loader in make_image_loaders( + sizes=["random"], color_spaces=[color_space], dtypes=[torch.float32], constant_alpha=True + ): + yield ArgsKwargs(image_loader, old_color_space=color_space, new_color_space=color_space, copy=False) @pil_reference_wrapper -def reference_convert_color_space_image_tensor(image_pil, old_color_space, new_color_space, copy): +def reference_convert_color_space_image_tensor(image_pil, old_color_space, new_color_space, copy=True): color_space_pil = features.ColorSpace.from_pil_mode(image_pil.mode) if color_space_pil != old_color_space: raise pytest.UsageError( @@ -600,25 +668,30 @@ _ROTATE_ANGLES = [-87, 15, 90] def sample_inputs_rotate_image_tensor(): - for image_loader, params in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), - combinations_grid( - interpolation=[F.InterpolationMode.NEAREST, F.InterpolationMode.BILINEAR], - expand=[True, False], - center=[None, (0, 0)], - ), + make_rotate_image_loaders = functools.partial( + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ) + + for image_loader in make_rotate_image_loaders(): + yield ArgsKwargs(image_loader, angle=15.0, expand=True) + + for image_loader, center in itertools.product( + make_rotate_image_loaders(), [None, [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)] ): - if params["center"] is not None and params["expand"]: - # Otherwise this will emit a warning and ignore center anyway - continue + yield ArgsKwargs(image_loader, angle=15.0, center=center) - for fill in [None, 0.5, [0.5] * image_loader.num_channels]: - yield ArgsKwargs( - image_loader, - angle=_ROTATE_ANGLES[0], - fill=fill, - **params, - ) + for image_loader in make_rotate_image_loaders(): + fills = [None, 0.5] + if image_loader.num_channels > 1: + fills.extend(vector_fill * image_loader.num_channels for vector_fill in [(0.5,), (1,), [0.5], [1]]) + for fill in fills: + yield ArgsKwargs(image_loader, angle=15.0, fill=fill) + + for image_loader, interpolation in itertools.product( + make_rotate_image_loaders(), + [F.InterpolationMode.NEAREST, F.InterpolationMode.BILINEAR], + ): + yield ArgsKwargs(image_loader, angle=15.0, fill=0) def reference_inputs_rotate_image_tensor(): @@ -637,22 +710,8 @@ def sample_inputs_rotate_bounding_box(): def sample_inputs_rotate_mask(): - for image_loader, params in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.uint8]), - combinations_grid( - expand=[True, False], - center=[None, (0, 0)], - ), - ): - if params["center"] is not None and params["expand"]: - # Otherwise this will emit a warning and ignore center anyway - continue - - yield ArgsKwargs( - image_loader, - angle=_ROTATE_ANGLES[0], - **params, - ) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, angle=15.0) @pil_reference_wrapper @@ -673,6 +732,11 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.rotate_image_pil), 
reference_inputs_fn=reference_inputs_rotate_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=[ + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), KernelInfo( F.rotate_bounding_box, @@ -692,7 +756,16 @@ _CROP_PARAMS = combinations_grid(top=[-8, 0, 9], left=[-8, 0, 9], height=[12, 20 def sample_inputs_crop_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): + for image_loader, params in itertools.product( + make_image_loaders(sizes=[(16, 17)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]), + [ + dict(top=4, left=3, height=7, width=8), + dict(top=-1, left=3, height=7, width=8), + dict(top=4, left=-1, height=7, width=8), + dict(top=4, left=3, height=17, width=8), + dict(top=4, left=3, height=7, width=18), + ], + ): yield ArgsKwargs(image_loader, **params) @@ -709,8 +782,8 @@ def sample_inputs_crop_bounding_box(): def sample_inputs_crop_mask(): - for mask_loader, params in itertools.product(make_mask_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): - yield ArgsKwargs(mask_loader, **params) + for mask_loader in make_mask_loaders(sizes=[(16, 17)], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, top=4, left=3, height=7, width=8) def reference_inputs_crop_mask(): @@ -829,12 +902,34 @@ _PAD_PARAMS = combinations_grid( def sample_inputs_pad_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(sizes=["random"]), _PAD_PARAMS): - fills = [None, 128.0, 128, [12.0]] - if params["padding_mode"] == "constant": - fills.append([12.0 + c for c in range(image_loader.num_channels)]) + make_pad_image_loaders = functools.partial( + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ) + + for image_loader, padding in itertools.product( + make_pad_image_loaders(), + [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]], + ): + yield ArgsKwargs(image_loader, padding=padding) + + for image_loader in make_pad_image_loaders(): + fills = [None, 0.5] + if image_loader.num_channels > 1: + fills.extend(vector_fill * image_loader.num_channels for vector_fill in [(0.5,), (1,), [0.5], [1]]) for fill in fills: - yield ArgsKwargs(image_loader, fill=fill, **params) + yield ArgsKwargs(image_loader, padding=[1], fill=fill) + + for image_loader, padding_mode in itertools.product( + # We branch for non-constant padding and integer inputs + make_pad_image_loaders(dtypes=[torch.uint8]), + ["constant", "symmetric", "edge", "reflect"], + ): + yield ArgsKwargs(image_loader, padding=[1], padding_mode=padding_mode) + + # `torch.nn.functional.pad` does not support symmetric padding, and thus we have a custom implementation. Besides + # negative padding, this is already handled by the inputs above. 
+ for image_loader in make_pad_image_loaders(): + yield ArgsKwargs(image_loader, padding=[-1], padding_mode="symmetric") def reference_inputs_pad_image_tensor(): @@ -848,18 +943,21 @@ def reference_inputs_pad_image_tensor(): def sample_inputs_pad_bounding_box(): - for bounding_box_loader, params in itertools.product(make_bounding_box_loaders(), _PAD_PARAMS): - if params["padding_mode"] != "constant": - continue - + for bounding_box_loader, padding in itertools.product( + make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size, **params + bounding_box_loader, + format=bounding_box_loader.format, + image_size=bounding_box_loader.image_size, + padding=padding, + padding_mode="constant", ) def sample_inputs_pad_mask(): - for image_loader, fill, params in itertools.product(make_mask_loaders(sizes=["random"]), [None, 127], _PAD_PARAMS): - yield ArgsKwargs(image_loader, fill=fill, **params) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + yield ArgsKwargs(mask_loader, padding=[1]) def reference_inputs_pad_mask(): @@ -875,10 +973,21 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.pad_image_pil), reference_inputs_fn=reference_inputs_pad_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + xfail_jit_tuple_instead_of_list("padding"), + xfail_jit_tuple_instead_of_list("fill"), + # TODO: check if this is a regression since it seems that should be supported if `int` is ok + xfail_jit_list_of_ints("fill"), + ], ), KernelInfo( F.pad_bounding_box, sample_inputs_fn=sample_inputs_pad_bounding_box, + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + xfail_jit_tuple_instead_of_list("padding"), + ], ), KernelInfo( F.pad_mask, @@ -1045,7 +1154,13 @@ _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] def sample_inputs_center_crop_image_tensor(): for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES), _CENTER_CROP_OUTPUT_SIZES + make_image_loaders(sizes=[(16, 17)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]), + [ + # valid `output_size` types for which cropping is applied to both dimensions + *[5, (4,), (2, 3), [6], [3, 2]], + # `output_size`'s for which at least one dimension needs to be padded + *[[4, 18], [17, 5], [17, 18]], + ], ): yield ArgsKwargs(image_loader, output_size=output_size) @@ -1068,10 +1183,9 @@ def sample_inputs_center_crop_bounding_box(): def sample_inputs_center_crop_mask(): - for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES), _CENTER_CROP_OUTPUT_SIZES - ): - yield ArgsKwargs(mask_loader, output_size=output_size) + for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + height, width = mask_loader.shape[-2:] + yield ArgsKwargs(mask_loader, output_size=(height // 2, width // 2)) def reference_inputs_center_crop_mask(): @@ -1090,14 +1204,14 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_center_crop_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), KernelInfo( F.center_crop_bounding_box, sample_inputs_fn=sample_inputs_center_crop_bounding_box, test_marks=[ - 
xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), KernelInfo( @@ -1107,7 +1221,7 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_center_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_integer_size_jit("output_size"), + xfail_jit_integer_size("output_size"), ], ), ] @@ -1115,18 +1229,21 @@ KERNEL_INFOS.extend( def sample_inputs_gaussian_blur_image_tensor(): - for image_loader, params in itertools.product( - make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ), - combinations_grid( - kernel_size=[(3, 3), [3, 3], 5], - sigma=[None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)], - ), + make_gaussian_blur_image_loaders = functools.partial( + make_image_loaders, + sizes=["random"], + color_spaces=[features.ColorSpace.RGB], + # FIXME: kernel should support arbitrary batch sizes + extra_dims=[(), (4,)], + ) + + for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): + yield ArgsKwargs(image_loader, kernel_size=kernel_size) + + for image_loader, sigma in itertools.product( + make_gaussian_blur_image_loaders(), [None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)] ): - yield ArgsKwargs(image_loader, **params) + yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma) KERNEL_INFOS.append( @@ -1135,8 +1252,8 @@ KERNEL_INFOS.append( sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, test_marks=[ - xfail_python_scalar_arg_jit("kernel_size"), - xfail_python_scalar_arg_jit("sigma"), + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), ], ) ) @@ -1518,7 +1635,9 @@ def _get_five_ten_crop_image_size(size): def sample_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ): yield ArgsKwargs(image_loader, size=size) @@ -1530,7 +1649,9 @@ def reference_inputs_five_crop_image_tensor(): def sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) @@ -1548,7 +1669,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.five_crop_image_pil), reference_inputs_fn=reference_inputs_five_crop_image_tensor, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, @@ -1559,7 +1680,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.ten_crop_image_pil), reference_inputs_fn=reference_inputs_ten_crop_image_tensor, test_marks=[ - xfail_integer_size_jit(), + xfail_jit_integer_size(), mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), ], closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index 
2be2964b9..20b76fbf0 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -755,12 +755,7 @@ def gaussian_blur(img: Tensor, kernel_size: List[int], sigma: List[float]) -> Te kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype, device=img.device) kernel = kernel.expand(img.shape[-3], 1, kernel.shape[0], kernel.shape[1]) - img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in( - img, - [ - kernel.dtype, - ], - ) + img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [kernel.dtype]) # padding = (left, right, top, bottom) padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2] -- GitLab From 61034d534c1dff58a66bf7e2a9be8c173648a483 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 6 Oct 2022 15:19:22 +0100 Subject: [PATCH 017/624] Avoid recommuting the affine matrix in bbox rotate (#6712) * Avoid recommuting the affine matrix in bbox rotate * Fix linter * inverted=True for estimating image size * Update the image size estimation to match the one from the image kernel * Nits * Address comments. * Center=0,0 when expand=true --- .../transforms/functional/_geometry.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 6a035b257..7a291967b 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -279,9 +279,9 @@ def affine_image_tensor( center_f = [0.0, 0.0] if center is not None: # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. - center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])] + center_f = [(c - s * 0.5) for c, s in zip(center, [width, height])] - translate_f = [1.0 * t for t in translate] + translate_f = [float(t) for t in translate] matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear) output = _FT.affine(image, matrix, interpolation=interpolation.value, fill=fill) @@ -321,7 +321,7 @@ def _affine_bounding_box_xyxy( shear: List[float], center: Optional[List[float]] = None, expand: bool = False, -) -> torch.Tensor: +) -> Tuple[torch.Tensor, Tuple[int, int]]: angle, translate, shear, center = _affine_parse_args( angle, translate, scale, shear, InterpolationMode.NEAREST, center ) @@ -333,11 +333,16 @@ def _affine_bounding_box_xyxy( dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 device = bounding_box.device - affine_matrix = torch.tensor( - _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False), - dtype=dtype, - device=device, - ).view(2, 3) + affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False) + transposed_affine_matrix = ( + torch.tensor( + affine_vector, + dtype=dtype, + device=device, + ) + .view(2, 3) + .T + ) # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-left, bottom-right corners). 
# Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to @@ -345,7 +350,7 @@ def _affine_bounding_box_xyxy( points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using affine matrix - transformed_points = torch.matmul(points, affine_matrix.T) + transformed_points = torch.matmul(points, transposed_affine_matrix) # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] # and compute bounding box from 4 transformed points: transformed_points = transformed_points.view(-1, 4, 2) @@ -360,20 +365,24 @@ def _affine_bounding_box_xyxy( points = torch.tensor( [ [0.0, 0.0, 1.0], - [0.0, 1.0 * height, 1.0], - [1.0 * width, 1.0 * height, 1.0], - [1.0 * width, 0.0, 1.0], + [0.0, float(height), 1.0], + [float(width), float(height), 1.0], + [float(width), 0.0, 1.0], ], dtype=dtype, device=device, ) - new_points = torch.matmul(points, affine_matrix.T) + new_points = torch.matmul(points, transposed_affine_matrix) tr, _ = torch.min(new_points, dim=0, keepdim=True) # Translate bounding boxes out_bboxes[:, 0::2] = out_bboxes[:, 0::2] - tr[:, 0] out_bboxes[:, 1::2] = out_bboxes[:, 1::2] - tr[:, 1] + # Estimate meta-data for image with inverted=True and with center=[0,0] + affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) + new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height) + image_size = (new_height, new_width) - return out_bboxes.to(bounding_box.dtype) + return out_bboxes.to(bounding_box.dtype), image_size def affine_bounding_box( @@ -391,7 +400,7 @@ def affine_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) + out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) # out_bboxes should be of shape [N boxes, 4] @@ -502,7 +511,7 @@ def rotate_image_tensor( warnings.warn("The provided center argument has no effect on the result if expand is True") else: # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. - center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])] + center_f = [(c - s * 0.5) for c, s in zip(center, [width, height])] # due to current incoherence of rotation angle direction between affine and rotate implementations # we need to set -angle. 
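[Editor's aside, not part of the patch above] The corner-point transform that the reworked `_affine_bounding_box_xyxy` performs can be easier to follow outside the diff. Below is a minimal sketch of the same three numbered steps using toy values: a single XYXY box and a hand-written 2x3 affine matrix standing in for the one the real kernel builds with `_get_inverse_affine_matrix(..., inverted=False)`.

import torch

boxes = torch.tensor([[10.0, 20.0, 30.0, 60.0]])  # one box in XYXY format
affine_matrix = torch.tensor([[0.0, -1.0, 0.0], [1.0, 0.0, 0.0]])  # toy map: (x, y) -> (-y, x)

# 1) expand each box into its four corners as homogeneous (x, y, 1) points
points = boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = torch.cat([points, torch.ones(points.shape[0], 1)], dim=-1)

# 2) apply the affine map to all corners at once: (N * 4, 3) @ (3, 2) -> (N * 4, 2)
transformed = points @ affine_matrix.T

# 3) regroup per box and take min/max over the corners to get the enclosing XYXY box
transformed = transformed.reshape(-1, 4, 2)
out_boxes = torch.cat([transformed.min(dim=1).values, transformed.max(dim=1).values], dim=-1)
print(out_boxes)  # -> [[-60., 10., -20., 30.]]

With `expand=True` the kernel additionally applies the same matrix to the four image corners to derive the translation and the new canvas size, which is why the function now returns the image size alongside the boxes.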
@@ -558,7 +567,7 @@ def rotate_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes = _affine_bounding_box_xyxy( + out_bboxes, image_size = _affine_bounding_box_xyxy( bounding_box, image_size, angle=-angle, @@ -569,14 +578,6 @@ def rotate_bounding_box( expand=expand, ) - if expand: - # TODO: Move this computation inside of `_affine_bounding_box_xyxy` to avoid computing the rotation and points - # matrix twice - height, width = image_size - rotation_matrix = _get_inverse_affine_matrix([0.0, 0.0], angle, [0.0, 0.0], 1.0, [0.0, 0.0]) - new_width, new_height = _FT._compute_affine_output_size(rotation_matrix, width, height) - image_size = (new_height, new_width) - return ( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False -- GitLab From 7d2de404372b0a77c5dec825c62f739e75a351ee Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 6 Oct 2022 11:49:29 -0400 Subject: [PATCH 018/624] Fix windows python 3.8 required dlls not found (#6715) * Fix windows python 3.8 * Update torchvision/extension.py Co-authored-by: Vasilis Vryniotis * Update torchvision/extension.py Co-authored-by: Vasilis Vryniotis --- torchvision/extension.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torchvision/extension.py b/torchvision/extension.py index 3bad8351b..702e7e33b 100644 --- a/torchvision/extension.py +++ b/torchvision/extension.py @@ -16,6 +16,18 @@ def _has_ops(): try: + # On Windows Python-3.8.x has `os.add_dll_directory` call, + # which is called to configure dll search path. + # To find cuda related dlls we need to make sure the + # conda environment/bin path is configured Please take a look: + # https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python + if os.name == "nt" and sys.version_info >= (3, 8) and sys.version_info < (3, 9): + env_path = os.environ["PATH"] + path_arr = env_path.split(";") + for path in path_arr: + if os.path.exists(path): + os.add_dll_directory(path) # type: ignore[attr-defined] + lib_path = _get_extension_path("_C") torch.ops.load_library(lib_path) _HAS_OPS = True -- GitLab From 7eb5d7fcab73afec976907a855d9e63fa31f5579 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 6 Oct 2022 22:42:42 +0200 Subject: [PATCH 019/624] close streams in prototype datasets (#6647) * close streams in prototype datasets * refactor prototype SBD to avoid closing demux streams at construction time * mypy --- test/builtin_dataset_mocks.py | 28 ++++---- test/test_prototype_datasets_builtin.py | 70 +++++++++++++++---- .../prototype/datasets/_builtin/celeba.py | 10 +-- .../prototype/datasets/_builtin/cifar.py | 4 +- .../prototype/datasets/_builtin/clevr.py | 2 + .../prototype/datasets/_builtin/mnist.py | 2 + .../prototype/datasets/_builtin/pcam.py | 2 + .../prototype/datasets/_builtin/sbd.py | 68 ++++++++++-------- .../prototype/datasets/_builtin/voc.py | 4 +- .../prototype/datasets/utils/_internal.py | 8 +-- torchvision/prototype/features/_encoded.py | 4 +- 11 files changed, 135 insertions(+), 67 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 8c5484a28..001e7e831 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -661,15 +661,15 @@ class SBDMockData: _NUM_CATEGORIES = 20 @classmethod - def _make_split_files(cls, root_map): - ids_map = { - split: [f"2008_{idx:06d}" for idx in idcs] - for split, idcs in ( - ("train", [0, 1, 2]), - ("train_noval", [0, 2]), - ("val", 
[3]), - ) - } + def _make_split_files(cls, root_map, *, split): + splits_and_idcs = [ + ("train", [0, 1, 2]), + ("val", [3]), + ] + if split == "train_noval": + splits_and_idcs.append(("train_noval", [0, 2])) + + ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs} for split, ids in ids_map.items(): with open(root_map[split] / f"{split}.txt", "w") as fh: @@ -710,12 +710,14 @@ class SBDMockData: return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() @classmethod - def generate(cls, root): + def generate(cls, root, *, split): archive_folder = root / "benchmark_RELEASE" dataset_folder = archive_folder / "dataset" dataset_folder.mkdir(parents=True, exist_ok=True) - ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root})) + ids, num_samples_map = cls._make_split_files( + defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split + ) sizes = cls._make_anns_folder(dataset_folder, "cls", ids) create_image_folder( dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] @@ -723,12 +725,12 @@ class SBDMockData: make_tar(root, "benchmark.tgz", archive_folder, compression="gz") - return num_samples_map + return num_samples_map[split] @register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) def sbd(root, config): - return SBDMockData.generate(root)[config["split"]] + return SBDMockData.generate(root, split=config["split"]) @register_mock(configs=[dict()]) diff --git a/test/test_prototype_datasets_builtin.py b/test/test_prototype_datasets_builtin.py index 283a30a3d..7bea05fce 100644 --- a/test/test_prototype_datasets_builtin.py +++ b/test/test_prototype_datasets_builtin.py @@ -1,6 +1,7 @@ import functools import io import pickle +from collections import deque from pathlib import Path import pytest @@ -11,10 +12,11 @@ from torch.utils.data import DataLoader from torch.utils.data.graph import traverse_dps from torch.utils.data.graph_settings import get_all_graph_pipes from torchdata.datapipes.iter import ShardingFilter, Shuffler +from torchdata.datapipes.utils import StreamWrapper from torchvision._utils import sequence_to_str -from torchvision.prototype import datasets, transforms +from torchvision.prototype import datasets, features, transforms from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE -from torchvision.prototype.features import Image, Label + assert_samples_equal = functools.partial( assert_equal, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True @@ -25,6 +27,17 @@ def extract_datapipes(dp): return get_all_graph_pipes(traverse_dps(dp)) +def consume(iterator): + # Copied from the official itertools recipes: https://docs.python.org/3/library/itertools.html#itertools-recipes + deque(iterator, maxlen=0) + + +def next_consume(iterator): + item = next(iterator) + consume(iterator) + return item + + @pytest.fixture(autouse=True) def test_home(mocker, tmp_path): mocker.patch("torchvision.prototype.datasets._api.home", return_value=str(tmp_path)) @@ -66,7 +79,7 @@ class TestCommon: dataset, _ = dataset_mock.load(config) try: - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) except StopIteration: raise AssertionError("Unable to draw any sample.") from None except Exception as error: @@ -84,22 +97,53 @@ class TestCommon: assert len(list(dataset)) == mock_info["num_samples"] + @pytest.fixture + def log_session_streams(self): + 
debug_unclosed_streams = StreamWrapper.debug_unclosed_streams + try: + StreamWrapper.debug_unclosed_streams = True + yield + finally: + StreamWrapper.debug_unclosed_streams = debug_unclosed_streams + @parametrize_dataset_mocks(DATASET_MOCKS) - def test_no_vanilla_tensors(self, dataset_mock, config): + def test_stream_closing(self, log_session_streams, dataset_mock, config): + def make_msg_and_close(head): + unclosed_streams = [] + for stream in StreamWrapper.session_streams.keys(): + unclosed_streams.append(repr(stream.file_obj)) + stream.close() + unclosed_streams = "\n".join(unclosed_streams) + return f"{head}\n\n{unclosed_streams}" + + if StreamWrapper.session_streams: + raise pytest.UsageError(make_msg_and_close("A previous test did not close the following streams:")) + dataset, _ = dataset_mock.load(config) - vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor} - if vanilla_tensors: + consume(iter(dataset)) + + if StreamWrapper.session_streams: + raise AssertionError(make_msg_and_close("The following streams were not closed after a full iteration:")) + + @parametrize_dataset_mocks(DATASET_MOCKS) + def test_no_simple_tensors(self, dataset_mock, config): + dataset, _ = dataset_mock.load(config) + + simple_tensors = {key for key, value in next_consume(iter(dataset)).items() if features.is_simple_tensor(value)} + if simple_tensors: raise AssertionError( f"The values of key(s) " - f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors." + f"{sequence_to_str(sorted(simple_tensors), separate_last='and ')} contained simple tensors." ) @parametrize_dataset_mocks(DATASET_MOCKS) def test_transformable(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - next(iter(dataset.map(transforms.Identity()))) + dataset = dataset.map(transforms.Identity()) + + consume(iter(dataset)) @parametrize_dataset_mocks(DATASET_MOCKS) def test_traversable(self, dataset_mock, config): @@ -131,7 +175,7 @@ class TestCommon: collate_fn=self._collate_fn, ) - next(iter(dl)) + consume(dl) # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also # that the Shuffler comes before the ShardingFilter. 
Early commits in https://github.com/pytorch/vision/pull/5680 @@ -148,7 +192,7 @@ class TestCommon: def test_save_load(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) with io.BytesIO() as buffer: torch.save(sample, buffer) @@ -177,7 +221,7 @@ class TestQMNIST: def test_extra_label(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) for key, type in ( ("nist_hsf_series", int), ("nist_writer_id", int), @@ -214,7 +258,7 @@ class TestUSPS: assert "image" in sample assert "label" in sample - assert isinstance(sample["image"], Image) - assert isinstance(sample["label"], Label) + assert isinstance(sample["image"], features.Image) + assert isinstance(sample["label"], features.Label) assert sample["image"].shape == (1, 16, 16) diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index e42657e82..a0a021845 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -30,24 +30,26 @@ class CelebACSVParser(IterDataPipe[Tuple[str, Dict[str, str]]]): def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]: for _, file in self.datapipe: - file = (line.decode() for line in file) + lines = (line.decode() for line in file) if self.fieldnames: fieldnames = self.fieldnames else: # The first row is skipped, because it only contains the number of samples - next(file) + next(lines) # Empty field names are filtered out, because some files have an extra white space after the header # line, which is recognized as extra column - fieldnames = [name for name in next(csv.reader([next(file)], dialect="celeba")) if name] + fieldnames = [name for name in next(csv.reader([next(lines)], dialect="celeba")) if name] # Some files do not include a label for the image ID column if fieldnames[0] != "image_id": fieldnames.insert(0, "image_id") - for line in csv.DictReader(file, fieldnames=fieldnames, dialect="celeba"): + for line in csv.DictReader(lines, fieldnames=fieldnames, dialect="celeba"): yield line.pop("image_id"), line + file.close() + NAME = "celeba" diff --git a/torchvision/prototype/datasets/_builtin/cifar.py b/torchvision/prototype/datasets/_builtin/cifar.py index 26196ded6..0fff2e6a1 100644 --- a/torchvision/prototype/datasets/_builtin/cifar.py +++ b/torchvision/prototype/datasets/_builtin/cifar.py @@ -62,7 +62,9 @@ class _CifarBase(Dataset): def _unpickle(self, data: Tuple[str, io.BytesIO]) -> Dict[str, Any]: _, file = data - return cast(Dict[str, Any], pickle.load(file, encoding="latin1")) + content = cast(Dict[str, Any], pickle.load(file, encoding="latin1")) + file.close() + return content def _prepare_sample(self, data: Tuple[np.ndarray, int]) -> Dict[str, Any]: image_array, category_idx = data diff --git a/torchvision/prototype/datasets/_builtin/clevr.py b/torchvision/prototype/datasets/_builtin/clevr.py index 4ddacdfb9..cb701fbe6 100644 --- a/torchvision/prototype/datasets/_builtin/clevr.py +++ b/torchvision/prototype/datasets/_builtin/clevr.py @@ -97,6 +97,8 @@ class CLEVR(Dataset): buffer_size=INFINITE_BUFFER_SIZE, ) else: + for _, file in scenes_dp: + file.close() dp = Mapper(images_dp, self._add_empty_anns) return Mapper(dp, self._prepare_sample) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 7a459b2d0..c13836a8c 100644 --- 
a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -57,6 +57,8 @@ class MNISTFileReader(IterDataPipe[torch.Tensor]): for _ in range(stop - start): yield read(dtype=dtype, count=count).reshape(shape) + file.close() + class _MNISTBase(Dataset): _URL_BASE: Union[str, Sequence[str]] diff --git a/torchvision/prototype/datasets/_builtin/pcam.py b/torchvision/prototype/datasets/_builtin/pcam.py index 162f22f1a..3a9fe6e90 100644 --- a/torchvision/prototype/datasets/_builtin/pcam.py +++ b/torchvision/prototype/datasets/_builtin/pcam.py @@ -33,6 +33,8 @@ class PCAMH5Reader(IterDataPipe[Tuple[str, io.IOBase]]): data = data[self.key] yield from data + handle.close() + _Resource = namedtuple("_Resource", ("file_name", "gdrive_id", "sha256")) diff --git a/torchvision/prototype/datasets/_builtin/sbd.py b/torchvision/prototype/datasets/_builtin/sbd.py index c7a79c418..7aea1e0f7 100644 --- a/torchvision/prototype/datasets/_builtin/sbd.py +++ b/torchvision/prototype/datasets/_builtin/sbd.py @@ -49,31 +49,35 @@ class SBD(Dataset): super().__init__(root, dependencies=("scipy",), skip_integrity_check=skip_integrity_check) def _resources(self) -> List[OnlineResource]: - archive = HttpResource( - "https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz", - sha256="6a5a2918d5c73ce032fdeba876574d150d9d04113ab87540a1304cbcc715be53", - ) - extra_split = HttpResource( - "http://home.bharathh.info/pubs/codes/SBD/train_noval.txt", - sha256="0b2068f7a359d2907431803e1cd63bf6162da37d7d503b589d3b08c6fd0c2432", - ) - return [archive, extra_split] + resources = [ + HttpResource( + "https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz", + sha256="6a5a2918d5c73ce032fdeba876574d150d9d04113ab87540a1304cbcc715be53", + ) + ] + if self._split == "train_noval": + resources.append( + HttpResource( + "http://home.bharathh.info/pubs/codes/SBD/train_noval.txt", + sha256="0b2068f7a359d2907431803e1cd63bf6162da37d7d503b589d3b08c6fd0c2432", + ) + ) + return resources # type: ignore[return-value] def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: path = pathlib.Path(data[0]) parent, grandparent, *_ = path.parents - if parent.name == "dataset": - return 0 - elif grandparent.name == "dataset": + if grandparent.name == "dataset": if parent.name == "img": - return 1 + return 0 elif parent.name == "cls": - return 2 - else: - return None - else: - return None + return 1 + + if parent.name == "dataset" and self._split != "train_noval": + return 2 + + return None def _prepare_sample(self, data: Tuple[Tuple[Any, Tuple[str, BinaryIO]], Tuple[str, BinaryIO]]) -> Dict[str, Any]: split_and_image_data, ann_data = data @@ -93,18 +97,24 @@ class SBD(Dataset): ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp, extra_split_dp = resource_dps - - archive_dp = resource_dps[0] - split_dp, images_dp, anns_dp = Demultiplexer( - archive_dp, - 3, - self._classify_archive, - buffer_size=INFINITE_BUFFER_SIZE, - drop_none=True, - ) if self._split == "train_noval": - split_dp = extra_split_dp + archive_dp, split_dp = resource_dps + images_dp, anns_dp = Demultiplexer( + archive_dp, + 2, + self._classify_archive, + buffer_size=INFINITE_BUFFER_SIZE, + drop_none=True, + ) + else: + archive_dp = resource_dps[0] + images_dp, anns_dp, split_dp = Demultiplexer( + archive_dp, + 3, + self._classify_archive, + buffer_size=INFINITE_BUFFER_SIZE, + drop_none=True, + ) 
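# Editor's aside (not part of the patch): a minimal, self-contained sketch of the
# Demultiplexer pattern used in the hunk above. The classifier returns an output
# index per item, or None to drop the item when drop_none=True. The paths below
# are made up for illustration and are unrelated to the real SBD archive layout.
from torchdata.datapipes.iter import Demultiplexer, IterableWrapper

toy_entries = IterableWrapper(["dataset/img/a.jpg", "dataset/cls/a.mat", "README.md"])

def toy_classifier(path):
    if "/img/" in path:
        return 0  # routed to the first output datapipe (images)
    if "/cls/" in path:
        return 1  # routed to the second output datapipe (annotations)
    return None  # dropped entirely because drop_none=True

toy_images_dp, toy_anns_dp = Demultiplexer(toy_entries, 2, toy_classifier, drop_none=True, buffer_size=10)
assert list(toy_images_dp) == ["dataset/img/a.jpg"]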
split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt")) split_dp = LineReader(split_dp, decode=True) diff --git a/torchvision/prototype/datasets/_builtin/voc.py b/torchvision/prototype/datasets/_builtin/voc.py index 2f13ce10d..84a9b3a7f 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -94,7 +94,9 @@ class VOC(Dataset): return None def _parse_detection_ann(self, buffer: BinaryIO) -> Dict[str, Any]: - return cast(Dict[str, Any], VOCDetection.parse_voc_xml(ElementTree.parse(buffer).getroot())["annotation"]) + ann = cast(Dict[str, Any], VOCDetection.parse_voc_xml(ElementTree.parse(buffer).getroot())["annotation"]) + buffer.close() + return ann def _prepare_detection_ann(self, buffer: BinaryIO) -> Dict[str, Any]: anns = self._parse_detection_ann(buffer) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 0385d98c2..55f1b8a3f 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -8,7 +8,6 @@ import torch import torch.distributed as dist import torch.utils.data from torchdata.datapipes.iter import IoPathFileLister, IoPathFileOpener, IterDataPipe, ShardingFilter, Shuffler -from torchdata.datapipes.utils import StreamWrapper from torchvision.prototype.utils._internal import fromfile @@ -40,10 +39,9 @@ def read_mat(buffer: BinaryIO, **kwargs: Any) -> Any: except ImportError as error: raise ModuleNotFoundError("Package `scipy` is required to be installed to read .mat files.") from error - if isinstance(buffer, StreamWrapper): - buffer = buffer.file_obj - - return sio.loadmat(buffer, **kwargs) + data = sio.loadmat(buffer, **kwargs) + buffer.close() + return data class MappingIterator(IterDataPipe[Union[Tuple[K, D], D]]): diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index b8b983960..0ec14ab20 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -27,7 +27,9 @@ class EncodedData(_Feature): @classmethod def from_file(cls: Type[D], file: BinaryIO, **kwargs: Any) -> D: - return cls(fromfile(file, dtype=torch.uint8, byte_order=sys.byteorder), **kwargs) + encoded_data = cls(fromfile(file, dtype=torch.uint8, byte_order=sys.byteorder), **kwargs) + file.close() + return encoded_data @classmethod def from_path(cls: Type[D], path: Union[str, os.PathLike], **kwargs: Any) -> D: -- GitLab From 3118fb520d5c8f0d413241104b27848c46c2460e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 7 Oct 2022 15:59:42 +0200 Subject: [PATCH 020/624] add Video feature and kernels (#6667) * add video feature * add video kernels * add video testing utils * add one kernel info * fix kernel names in Video feature * use only uint8 for video testing * require at least 4 dims for Video feature * add TODO for image_size -> spatial_size * image -> video in feature constructor * introduce new combined images and video type * add video to transform utils * fix transforms test * fix auto augment * cleanup * address review comments * add remaining video kernel infos * add batch dimension squashing to some kernels * fix tests and kernel infos * add xfails for arbitrary batch sizes on some kernels * fix test setup * fix equalize_image_tensor for multi batch dims * fix adjust_sharpness_image_tensor for multi batch dims * address review comments --- test/prototype_common_utils.py | 81 ++- 
test/prototype_transforms_dispatcher_infos.py | 20 + test/prototype_transforms_kernel_infos.py | 601 +++++++++++++----- test/test_prototype_transforms.py | 9 +- test/test_prototype_transforms_functional.py | 1 + torchvision/prototype/features/__init__.py | 1 + torchvision/prototype/features/_video.py | 240 +++++++ torchvision/prototype/transforms/_augment.py | 4 +- .../prototype/transforms/_auto_augment.py | 86 +-- torchvision/prototype/transforms/_color.py | 12 +- torchvision/prototype/transforms/_geometry.py | 6 +- torchvision/prototype/transforms/_meta.py | 4 +- torchvision/prototype/transforms/_misc.py | 8 +- torchvision/prototype/transforms/_utils.py | 4 +- .../transforms/functional/__init__.py | 35 +- .../transforms/functional/_augment.py | 14 +- .../prototype/transforms/functional/_color.py | 97 ++- .../transforms/functional/_geometry.py | 137 ++++ .../prototype/transforms/functional/_meta.py | 34 +- .../prototype/transforms/functional/_misc.py | 34 +- 20 files changed, 1171 insertions(+), 257 deletions(-) create mode 100644 torchvision/prototype/features/_video.py diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 333e11fb2..c10cec94c 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -45,6 +45,8 @@ __all__ = [ "make_segmentation_masks", "make_mask_loaders", "make_masks", + "make_video", + "make_videos", ] @@ -210,17 +212,19 @@ DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) def from_loader(loader_fn): def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") loader = loader_fn(*args, **kwargs) - return loader.load(kwargs.get("device", "cpu")) + return loader.load(device) return wrapper def from_loaders(loaders_fn): def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") loaders = loaders_fn(*args, **kwargs) for loader in loaders: - yield loader.load(kwargs.get("device", "cpu")) + yield loader.load(device) return wrapper @@ -246,6 +250,21 @@ class ImageLoader(TensorLoader): self.num_channels = self.shape[-3] +NUM_CHANNELS_MAP = { + features.ColorSpace.GRAY: 1, + features.ColorSpace.GRAY_ALPHA: 2, + features.ColorSpace.RGB: 3, + features.ColorSpace.RGB_ALPHA: 4, +} + + +def get_num_channels(color_space): + num_channels = NUM_CHANNELS_MAP.get(color_space) + if not num_channels: + raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") + return num_channels + + def make_image_loader( size="random", *, @@ -255,16 +274,7 @@ def make_image_loader( constant_alpha=True, ): size = _parse_image_size(size) - - try: - num_channels = { - features.ColorSpace.GRAY: 1, - features.ColorSpace.GRAY_ALPHA: 2, - features.ColorSpace.RGB: 3, - features.ColorSpace.RGB_ALPHA: 4, - }[color_space] - except KeyError as error: - raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") from error + num_channels = get_num_channels(color_space) def fn(shape, dtype, device): max_value = get_max_value(dtype) @@ -531,3 +541,50 @@ def make_mask_loaders( make_masks = from_loaders(make_mask_loaders) + + +class VideoLoader(ImageLoader): + pass + + +def make_video_loader( + size="random", + *, + color_space=features.ColorSpace.RGB, + num_frames="random", + extra_dims=(), + dtype=torch.uint8, +): + size = _parse_image_size(size) + num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames + + def fn(shape, dtype, device): + video = make_image(size=shape[-2:], color_space=color_space, extra_dims=shape[:-3], 
dtype=dtype, device=device) + return features.Video(video, color_space=color_space) + + return VideoLoader( + fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype, color_space=color_space + ) + + +make_video = from_loader(make_video_loader) + + +def make_video_loaders( + *, + sizes=DEFAULT_IMAGE_SIZES, + color_spaces=( + features.ColorSpace.GRAY, + features.ColorSpace.RGB, + ), + num_frames=(1, 0, "random"), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid( + size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes + ): + yield make_video_loader(**params) + + +make_videos = from_loaders(make_video_loaders) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index 9678249aa..be8bd3002 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -127,6 +127,23 @@ xfail_dispatch_pil_if_fill_sequence_needs_broadcast = TestMark( ) +def xfail_all_tests(*, reason, condition): + return [ + TestMark(("TestDispatchers", test_name), pytest.mark.xfail(reason=reason), condition=condition) + for test_name in [ + "test_scripted_smoke", + "test_dispatch_simple_tensor", + "test_dispatch_feature", + ] + ] + + +xfails_degenerate_or_multi_batch_dims = xfail_all_tests( + reason="See https://github.com/pytorch/vision/issues/6670 for details.", + condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), +) + + DISPATCHER_INFOS = [ DispatcherInfo( F.horizontal_flip, @@ -243,6 +260,7 @@ DISPATCHER_INFOS = [ pil_kernel_info=PILKernelInfo(F.perspective_image_pil), test_marks=[ xfail_dispatch_pil_if_fill_sequence_needs_broadcast, + *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( @@ -253,6 +271,7 @@ DISPATCHER_INFOS = [ features.Mask: F.elastic_mask, }, pil_kernel_info=PILKernelInfo(F.elastic_image_pil), + test_marks=xfails_degenerate_or_multi_batch_dims, ), DispatcherInfo( F.center_crop, @@ -275,6 +294,7 @@ DISPATCHER_INFOS = [ test_marks=[ xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), + *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c0e7bf5bf..d90d3bf68 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -20,6 +20,7 @@ from prototype_common_utils import ( make_image_loader, make_image_loaders, make_mask_loaders, + make_video_loaders, VALID_EXTRA_DIMS, ) from torchvision.prototype import features @@ -142,6 +143,25 @@ def xfail_jit_list_of_ints(name, *, reason=None): ) +def xfail_all_tests(*, reason, condition): + return [ + TestMark(("TestKernels", test_name), pytest.mark.xfail(reason=reason), condition=condition) + for test_name in [ + "test_scripted_vs_eager", + "test_batched_vs_single", + "test_no_inplace", + "test_cuda_vs_cpu", + "test_dtype_and_device_consistency", + ] + ] + + +xfails_image_degenerate_or_multi_batch_dims = xfail_all_tests( + reason="See https://github.com/pytorch/vision/issues/6670 for details.", + condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), +) + + KERNEL_INFOS = [] @@ -169,6 +189,11 @@ def sample_inputs_horizontal_flip_mask(): yield ArgsKwargs(image_loader) +def sample_inputs_horizontal_flip_video(): + for video_loader in 
make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -187,6 +212,10 @@ KERNEL_INFOS.extend( F.horizontal_flip_mask, sample_inputs_fn=sample_inputs_horizontal_flip_mask, ), + KernelInfo( + F.horizontal_flip_video, + sample_inputs_fn=sample_inputs_horizontal_flip_video, + ), ] ) @@ -287,6 +316,11 @@ def reference_inputs_resize_mask(): yield ArgsKwargs(mask_loader, size=size) +def sample_inputs_resize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, size=[min(video_loader.shape[-2:]) + 1]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -316,6 +350,10 @@ KERNEL_INFOS.extend( xfail_jit_integer_size(), ], ), + KernelInfo( + F.resize_video, + sample_inputs_fn=sample_inputs_resize_video, + ), ] ) @@ -485,7 +523,7 @@ def reference_inputs_affine_bounding_box(): ) -def sample_inputs_affine_image_mask(): +def sample_inputs_affine_mask(): for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): yield ArgsKwargs(mask_loader, **_full_affine_params()) @@ -502,6 +540,11 @@ def reference_inputs_resize_mask(): yield ArgsKwargs(mask_loader, **affine_kwargs) +def sample_inputs_affine_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, **_full_affine_params()) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -529,7 +572,7 @@ KERNEL_INFOS.extend( ), KernelInfo( F.affine_mask, - sample_inputs_fn=sample_inputs_affine_image_mask, + sample_inputs_fn=sample_inputs_affine_mask, reference_fn=reference_affine_mask, reference_inputs_fn=reference_inputs_resize_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, @@ -537,6 +580,10 @@ KERNEL_INFOS.extend( xfail_jit_python_scalar_arg("shear"), ], ), + KernelInfo( + F.affine_video, + sample_inputs_fn=sample_inputs_affine_video, + ), ] ) @@ -608,14 +655,28 @@ def reference_inputs_convert_color_space_image_tensor(): yield args_kwargs -KERNEL_INFOS.append( - KernelInfo( - F.convert_color_space_image_tensor, - sample_inputs_fn=sample_inputs_convert_color_space_image_tensor, - reference_fn=reference_convert_color_space_image_tensor, - reference_inputs_fn=reference_inputs_convert_color_space_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), +def sample_inputs_convert_color_space_video(): + color_spaces = [features.ColorSpace.GRAY, features.ColorSpace.RGB] + + for old_color_space, new_color_space in cycle_over(color_spaces): + for video_loader in make_video_loaders(sizes=["random"], color_spaces=[old_color_space], num_frames=["random"]): + yield ArgsKwargs(video_loader, old_color_space=old_color_space, new_color_space=new_color_space) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.convert_color_space_image_tensor, + sample_inputs_fn=sample_inputs_convert_color_space_image_tensor, + reference_fn=reference_convert_color_space_image_tensor, + reference_inputs_fn=reference_inputs_convert_color_space_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.convert_color_space_video, + sample_inputs_fn=sample_inputs_convert_color_space_video, + ), + ] ) @@ -643,6 +704,11 @@ def sample_inputs_vertical_flip_mask(): yield ArgsKwargs(image_loader) +def sample_inputs_vertical_flip_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -661,6 +727,10 @@ 
KERNEL_INFOS.extend( F.vertical_flip_mask, sample_inputs_fn=sample_inputs_vertical_flip_mask, ), + KernelInfo( + F.vertical_flip_video, + sample_inputs_fn=sample_inputs_vertical_flip_video, + ), ] ) @@ -724,6 +794,11 @@ def reference_inputs_rotate_mask(): yield ArgsKwargs(mask_loader, angle=angle) +def sample_inputs_rotate_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, angle=15.0) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -749,6 +824,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_rotate_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.rotate_video, + sample_inputs_fn=sample_inputs_rotate_video, + ), ] ) @@ -791,6 +870,11 @@ def reference_inputs_crop_mask(): yield ArgsKwargs(mask_loader, **params) +def sample_inputs_crop_video(): + for video_loader in make_video_loaders(sizes=[(16, 17)], num_frames=["random"]): + yield ArgsKwargs(video_loader, top=4, left=3, height=7, width=8) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -812,6 +896,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.crop_video, + sample_inputs_fn=sample_inputs_crop_video, + ), ] ) @@ -872,6 +960,11 @@ def reference_inputs_resized_crop_mask(): yield ArgsKwargs(mask_loader, **params) +def sample_inputs_resized_crop_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, **_RESIZED_CROP_PARAMS[0]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -892,6 +985,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_resized_crop_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.resized_crop_video, + sample_inputs_fn=sample_inputs_resized_crop_video, + ), ] ) @@ -965,6 +1062,11 @@ def reference_inputs_pad_mask(): yield ArgsKwargs(image_loader, fill=fill, **params) +def sample_inputs_pad_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, padding=[1]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -996,6 +1098,10 @@ KERNEL_INFOS.extend( reference_inputs_fn=reference_inputs_pad_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, ), + KernelInfo( + F.pad_video, + sample_inputs_fn=sample_inputs_pad_video, + ), ] ) @@ -1006,11 +1112,7 @@ _PERSPECTIVE_COEFFS = [ def sample_inputs_perspective_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for image_loader in make_image_loaders(sizes=["random"]): for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, fill=fill, perspective_coeffs=_PERSPECTIVE_COEFFS[0]) @@ -1030,11 +1132,7 @@ def sample_inputs_perspective_bounding_box(): def sample_inputs_perspective_mask(): - for mask_loader in make_mask_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for mask_loader in make_mask_loaders(sizes=["random"]): yield ArgsKwargs(mask_loader, perspective_coeffs=_PERSPECTIVE_COEFFS[0]) @@ -1045,6 +1143,11 @@ def reference_inputs_perspective_mask(): yield ArgsKwargs(mask_loader, perspective_coeffs=perspective_coeffs) +def sample_inputs_perspective_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, 
perspective_coeffs=_PERSPECTIVE_COEFFS[0]) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -1053,6 +1156,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.perspective_bounding_box, @@ -1064,6 +1168,11 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, + ), + KernelInfo( + F.perspective_video, + sample_inputs_fn=sample_inputs_perspective_video, ), ] ) @@ -1074,11 +1183,7 @@ def _get_elastic_displacement(image_size): def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for image_loader in make_image_loaders(sizes=["random"]): displacement = _get_elastic_displacement(image_loader.image_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -1109,11 +1214,7 @@ def sample_inputs_elastic_bounding_box(): def sample_inputs_elastic_mask(): - for mask_loader in make_mask_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): + for mask_loader in make_mask_loaders(sizes=["random"]): displacement = _get_elastic_displacement(mask_loader.shape[-2:]) yield ArgsKwargs(mask_loader, displacement=displacement) @@ -1124,6 +1225,12 @@ def reference_inputs_elastic_mask(): yield ArgsKwargs(mask_loader, displacement=displacement) +def sample_inputs_elastic_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + displacement = _get_elastic_displacement(video_loader.shape[-2:]) + yield ArgsKwargs(video_loader, displacement=displacement) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -1132,6 +1239,7 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.elastic_bounding_box, @@ -1143,6 +1251,11 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=xfails_image_degenerate_or_multi_batch_dims, + ), + KernelInfo( + F.elastic_video, + sample_inputs_fn=sample_inputs_elastic_video, ), ] ) @@ -1195,6 +1308,12 @@ def reference_inputs_center_crop_mask(): yield ArgsKwargs(mask_loader, output_size=output_size) +def sample_inputs_center_crop_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + height, width = video_loader.shape[-2:] + yield ArgsKwargs(video_loader, output_size=(height // 2, width // 2)) + + KERNEL_INFOS.extend( [ KernelInfo( @@ -1224,17 +1343,17 @@ KERNEL_INFOS.extend( xfail_jit_integer_size("output_size"), ], ), + KernelInfo( + F.center_crop_video, + sample_inputs_fn=sample_inputs_center_crop_video, + ), ] ) def sample_inputs_gaussian_blur_image_tensor(): make_gaussian_blur_image_loaders = functools.partial( - make_image_loaders, - sizes=["random"], - 
color_spaces=[features.ColorSpace.RGB], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], + make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB] ) for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): @@ -1246,26 +1365,34 @@ def sample_inputs_gaussian_blur_image_tensor(): yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma) -KERNEL_INFOS.append( - KernelInfo( - F.gaussian_blur_image_tensor, - sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=[ - xfail_jit_python_scalar_arg("kernel_size"), - xfail_jit_python_scalar_arg("sigma"), - ], - ) +def sample_inputs_gaussian_blur_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, kernel_size=[3, 3]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.gaussian_blur_image_tensor, + sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + test_marks=[ + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), + *xfails_image_degenerate_or_multi_batch_dims, + ], + ), + KernelInfo( + F.gaussian_blur_video, + sample_inputs_fn=sample_inputs_gaussian_blur_video, + ), + ] ) def sample_inputs_equalize_image_tensor(): for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), - dtypes=[torch.uint8], + sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), dtypes=[torch.uint8] ): yield ArgsKwargs(image_loader) @@ -1277,15 +1404,26 @@ def reference_inputs_equalize_image_tensor(): yield ArgsKwargs(image_loader) -KERNEL_INFOS.append( - KernelInfo( - F.equalize_image_tensor, - kernel_name="equalize_image_tensor", - sample_inputs_fn=sample_inputs_equalize_image_tensor, - reference_fn=pil_reference_wrapper(F.equalize_image_pil), - reference_inputs_fn=reference_inputs_equalize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_equalize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.equalize_image_tensor, + kernel_name="equalize_image_tensor", + sample_inputs_fn=sample_inputs_equalize_image_tensor, + reference_fn=pil_reference_wrapper(F.equalize_image_pil), + reference_inputs_fn=reference_inputs_equalize_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.equalize_video, + sample_inputs_fn=sample_inputs_equalize_video, + ), + ] ) @@ -1303,15 +1441,26 @@ def reference_inputs_invert_image_tensor(): yield ArgsKwargs(image_loader) -KERNEL_INFOS.append( - KernelInfo( - F.invert_image_tensor, - kernel_name="invert_image_tensor", - sample_inputs_fn=sample_inputs_invert_image_tensor, - reference_fn=pil_reference_wrapper(F.invert_image_pil), - reference_inputs_fn=reference_inputs_invert_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_invert_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.invert_image_tensor, + kernel_name="invert_image_tensor", + sample_inputs_fn=sample_inputs_invert_image_tensor, + 
reference_fn=pil_reference_wrapper(F.invert_image_pil), + reference_inputs_fn=reference_inputs_invert_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.invert_video, + sample_inputs_fn=sample_inputs_invert_video, + ), + ] ) @@ -1335,15 +1484,26 @@ def reference_inputs_posterize_image_tensor(): yield ArgsKwargs(image_loader, bits=bits) -KERNEL_INFOS.append( - KernelInfo( - F.posterize_image_tensor, - kernel_name="posterize_image_tensor", - sample_inputs_fn=sample_inputs_posterize_image_tensor, - reference_fn=pil_reference_wrapper(F.posterize_image_pil), - reference_inputs_fn=reference_inputs_posterize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_posterize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.posterize_image_tensor, + kernel_name="posterize_image_tensor", + sample_inputs_fn=sample_inputs_posterize_image_tensor, + reference_fn=pil_reference_wrapper(F.posterize_image_pil), + reference_inputs_fn=reference_inputs_posterize_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.posterize_video, + sample_inputs_fn=sample_inputs_posterize_video, + ), + ] ) @@ -1368,15 +1528,26 @@ def reference_inputs_solarize_image_tensor(): yield ArgsKwargs(image_loader, threshold=threshold) -KERNEL_INFOS.append( - KernelInfo( - F.solarize_image_tensor, - kernel_name="solarize_image_tensor", - sample_inputs_fn=sample_inputs_solarize_image_tensor, - reference_fn=pil_reference_wrapper(F.solarize_image_pil), - reference_inputs_fn=reference_inputs_solarize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_solarize_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, threshold=next(_get_solarize_thresholds(video_loader.dtype))) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.solarize_image_tensor, + kernel_name="solarize_image_tensor", + sample_inputs_fn=sample_inputs_solarize_image_tensor, + reference_fn=pil_reference_wrapper(F.solarize_image_pil), + reference_inputs_fn=reference_inputs_solarize_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.solarize_video, + sample_inputs_fn=sample_inputs_solarize_video, + ), + ] ) @@ -1394,15 +1565,26 @@ def reference_inputs_autocontrast_image_tensor(): yield ArgsKwargs(image_loader) -KERNEL_INFOS.append( - KernelInfo( - F.autocontrast_image_tensor, - kernel_name="autocontrast_image_tensor", - sample_inputs_fn=sample_inputs_autocontrast_image_tensor, - reference_fn=pil_reference_wrapper(F.autocontrast_image_pil), - reference_inputs_fn=reference_inputs_autocontrast_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_autocontrast_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.autocontrast_image_tensor, + kernel_name="autocontrast_image_tensor", + sample_inputs_fn=sample_inputs_autocontrast_image_tensor, + reference_fn=pil_reference_wrapper(F.autocontrast_image_pil), + reference_inputs_fn=reference_inputs_autocontrast_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.autocontrast_video, + sample_inputs_fn=sample_inputs_autocontrast_video, + ), + ] ) 
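# --- Editor's note: an illustrative sketch, not part of the patch. Every video kernel in
# this file is registered with the same two-piece recipe: a `sample_inputs_*_video`
# generator that yields ArgsKwargs built from `make_video_loaders` (whose loaders have
# shape (*extra_dims, num_frames, C, H, W)), and a KernelInfo entry tying the kernel to
# that generator. The snippet below spells the recipe out once, reusing the
# already-registered F.invert_video and a throwaway list so nothing is registered twice.


def _sketch_sample_inputs_invert_video():
    # "random" keeps the spatial size and frame count small but non-degenerate.
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


_SKETCH_KERNEL_INFOS = [
    KernelInfo(
        F.invert_video,
        sample_inputs_fn=_sketch_sample_inputs_invert_video,
    )
]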
_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5] @@ -1412,8 +1594,6 @@ def sample_inputs_adjust_sharpness_image_tensor(): for image_loader in make_image_loaders( sizes=["random", (2, 2)], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], ): yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) @@ -1426,15 +1606,26 @@ def reference_inputs_adjust_sharpness_image_tensor(): yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_sharpness_image_tensor, - kernel_name="adjust_sharpness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_sharpness_image_pil), - reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_sharpness_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_sharpness_image_tensor, + kernel_name="adjust_sharpness_image_tensor", + sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_sharpness_image_pil), + reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_sharpness_video, + sample_inputs_fn=sample_inputs_adjust_sharpness_video, + ), + ] ) @@ -1446,12 +1637,26 @@ def sample_inputs_erase_image_tensor(): yield ArgsKwargs(image_loader, i=1, j=2, h=h, w=w, v=v) -KERNEL_INFOS.append( - KernelInfo( - F.erase_image_tensor, - kernel_name="erase_image_tensor", - sample_inputs_fn=sample_inputs_erase_image_tensor, - ) +def sample_inputs_erase_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + # FIXME: make the parameters more diverse + h, w = 6, 7 + v = torch.rand(video_loader.num_channels, h, w) + yield ArgsKwargs(video_loader, i=1, j=2, h=h, w=w, v=v) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.erase_image_tensor, + kernel_name="erase_image_tensor", + sample_inputs_fn=sample_inputs_erase_image_tensor, + ), + KernelInfo( + F.erase_video, + sample_inputs_fn=sample_inputs_erase_video, + ), + ] ) _ADJUST_BRIGHTNESS_FACTORS = [0.1, 0.5] @@ -1472,15 +1677,26 @@ def reference_inputs_adjust_brightness_image_tensor(): yield ArgsKwargs(image_loader, brightness_factor=brightness_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_brightness_image_tensor, - kernel_name="adjust_brightness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_brightness_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_brightness_image_pil), - reference_inputs_fn=reference_inputs_adjust_brightness_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_brightness_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_brightness_image_tensor, + kernel_name="adjust_brightness_image_tensor", + sample_inputs_fn=sample_inputs_adjust_brightness_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_brightness_image_pil), + reference_inputs_fn=reference_inputs_adjust_brightness_image_tensor, + 
closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_brightness_video, + sample_inputs_fn=sample_inputs_adjust_brightness_video, + ), + ] ) @@ -1502,15 +1718,26 @@ def reference_inputs_adjust_contrast_image_tensor(): yield ArgsKwargs(image_loader, contrast_factor=contrast_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_contrast_image_tensor, - kernel_name="adjust_contrast_image_tensor", - sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_contrast_image_pil), - reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_contrast_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_contrast_image_tensor, + kernel_name="adjust_contrast_image_tensor", + sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_contrast_image_pil), + reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_contrast_video, + sample_inputs_fn=sample_inputs_adjust_contrast_video, + ), + ] ) _ADJUST_GAMMA_GAMMAS_GAINS = [ @@ -1535,15 +1762,27 @@ def reference_inputs_adjust_gamma_image_tensor(): yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_gamma_image_tensor, - kernel_name="adjust_gamma_image_tensor", - sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_gamma_image_pil), - reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_gamma_video(): + gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, gamma=gamma, gain=gain) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_gamma_image_tensor, + kernel_name="adjust_gamma_image_tensor", + sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_gamma_image_pil), + reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_gamma_video, + sample_inputs_fn=sample_inputs_adjust_gamma_video, + ), + ] ) @@ -1565,15 +1804,26 @@ def reference_inputs_adjust_hue_image_tensor(): yield ArgsKwargs(image_loader, hue_factor=hue_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_hue_image_tensor, - kernel_name="adjust_hue_image_tensor", - sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_hue_image_pil), - reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_hue_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_hue_image_tensor, + kernel_name="adjust_hue_image_tensor", + sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_hue_image_pil), + reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, + 
closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_hue_video, + sample_inputs_fn=sample_inputs_adjust_hue_video, + ), + ] ) _ADJUST_SATURATION_FACTORS = [0.1, 0.5] @@ -1594,15 +1844,26 @@ def reference_inputs_adjust_saturation_image_tensor(): yield ArgsKwargs(image_loader, saturation_factor=saturation_factor) -KERNEL_INFOS.append( - KernelInfo( - F.adjust_saturation_image_tensor, - kernel_name="adjust_saturation_image_tensor", - sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_saturation_image_pil), - reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) +def sample_inputs_adjust_saturation_video(): + for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_saturation_image_tensor, + kernel_name="adjust_saturation_image_tensor", + sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, + reference_fn=pil_reference_wrapper(F.adjust_saturation_image_pil), + reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + ), + KernelInfo( + F.adjust_saturation_video, + sample_inputs_fn=sample_inputs_adjust_saturation_video, + ), + ] ) @@ -1702,10 +1963,24 @@ def sample_inputs_normalize_image_tensor(): yield ArgsKwargs(image_loader, mean=mean, std=std) -KERNEL_INFOS.append( - KernelInfo( - F.normalize_image_tensor, - kernel_name="normalize_image_tensor", - sample_inputs_fn=sample_inputs_normalize_image_tensor, - ) +def sample_inputs_normalize_video(): + mean, std = _NORMALIZE_MEANS_STDS[0] + for video_loader in make_video_loaders( + sizes=["random"], color_spaces=[features.ColorSpace.RGB], num_frames=["random"], dtypes=[torch.float32] + ): + yield ArgsKwargs(video_loader, mean=mean, std=std) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.normalize_image_tensor, + kernel_name="normalize_image_tensor", + sample_inputs_fn=sample_inputs_normalize_image_tensor, + ), + KernelInfo( + F.normalize_video, + sample_inputs_fn=sample_inputs_normalize_video, + ), + ] ) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 9734a5dc3..916861f4e 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -17,6 +17,7 @@ from prototype_common_utils import ( make_masks, make_one_hot_labels, make_segmentation_mask, + make_videos, ) from torchvision.ops.boxes import box_iou from torchvision.prototype import features, transforms @@ -65,6 +66,7 @@ def parametrize_from_transforms(*transforms): make_vanilla_tensor_images, make_pil_images, make_masks, + make_videos, ]: inputs = list(creation_fn()) try: @@ -155,12 +157,14 @@ class TestSmoke: features.ColorSpace.RGB, ], dtypes=[torch.uint8], - extra_dims=[(4,)], + extra_dims=[(), (4,)], + **(dict(num_frames=["random"]) if fn is make_videos else dict()), ) for fn in [ make_images, make_vanilla_tensor_images, make_pil_images, + make_videos, ] ), ) @@ -184,6 +188,7 @@ class TestSmoke: for fn in [ make_images, make_vanilla_tensor_images, + make_videos, ] ), ), @@ -200,6 +205,7 @@ class TestSmoke: make_images(extra_dims=[(4,)]), make_vanilla_tensor_images(), make_pil_images(), + make_videos(extra_dims=[()]), ), ) ] @@ -218,6 +224,7 @@ class TestSmoke: make_images, make_vanilla_tensor_images, make_pil_images, + make_videos, ) ] ), diff 
--git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index a6523045c..5adea4d26 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -129,6 +129,7 @@ class TestKernels: # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as # common ground. features.Mask: 2, + features.Video: 4, }.get(feature_type) if data_dims is None: raise pytest.UsageError( diff --git a/torchvision/prototype/features/__init__.py b/torchvision/prototype/features/__init__.py index df77e8b77..6fc2fb6ea 100644 --- a/torchvision/prototype/features/__init__.py +++ b/torchvision/prototype/features/__init__.py @@ -13,3 +13,4 @@ from ._image import ( ) from ._label import Label, OneHotLabel from ._mask import Mask +from ._video import ImageOrVideoType, ImageOrVideoTypeJIT, TensorImageOrVideoType, TensorImageOrVideoTypeJIT, Video diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py new file mode 100644 index 000000000..e19b6f7ed --- /dev/null +++ b/torchvision/prototype/features/_video.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import warnings +from typing import Any, cast, List, Optional, Tuple, Union + +import torch +from torchvision.transforms.functional import InterpolationMode + +from ._feature import _Feature, FillTypeJIT +from ._image import ColorSpace, ImageType, ImageTypeJIT, TensorImageType, TensorImageTypeJIT + + +class Video(_Feature): + color_space: ColorSpace + + def __new__( + cls, + data: Any, + *, + color_space: Optional[Union[ColorSpace, str]] = None, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str, int]] = None, + requires_grad: bool = False, + ) -> Video: + data = torch.as_tensor(data, dtype=dtype, device=device) + if data.ndim < 4: + raise ValueError + video = super().__new__(cls, data, requires_grad=requires_grad) + + if color_space is None: + color_space = ColorSpace.from_tensor_shape(video.shape) # type: ignore[arg-type] + if color_space == ColorSpace.OTHER: + warnings.warn("Unable to guess a specific color space. 
Consider passing it explicitly.") + elif isinstance(color_space, str): + color_space = ColorSpace.from_str(color_space.upper()) + elif not isinstance(color_space, ColorSpace): + raise ValueError + video.color_space = color_space + + return video + + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(color_space=self.color_space) + + @classmethod + def new_like( + cls, other: Video, data: Any, *, color_space: Optional[Union[ColorSpace, str]] = None, **kwargs: Any + ) -> Video: + return super().new_like( + other, data, color_space=color_space if color_space is not None else other.color_space, **kwargs + ) + + # TODO: rename this (and all instances of this term to spatial size) + @property + def image_size(self) -> Tuple[int, int]: + return cast(Tuple[int, int], tuple(self.shape[-2:])) + + @property + def num_channels(self) -> int: + return self.shape[-3] + + @property + def num_frames(self) -> int: + return self.shape[-4] + + def to_color_space(self, color_space: Union[str, ColorSpace], copy: bool = True) -> Video: + if isinstance(color_space, str): + color_space = ColorSpace.from_str(color_space.upper()) + + return Video.new_like( + self, + self._F.convert_color_space_video( + self, old_color_space=self.color_space, new_color_space=color_space, copy=copy + ), + color_space=color_space, + ) + + def horizontal_flip(self) -> Video: + output = self._F.horizontal_flip_video(self) + return Video.new_like(self, output) + + def vertical_flip(self) -> Video: + output = self._F.vertical_flip_video(self) + return Video.new_like(self, output) + + def resize( # type: ignore[override] + self, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + max_size: Optional[int] = None, + antialias: bool = False, + ) -> Video: + output = self._F.resize_video(self, size, interpolation=interpolation, max_size=max_size, antialias=antialias) + return Video.new_like(self, output) + + def crop(self, top: int, left: int, height: int, width: int) -> Video: + output = self._F.crop_video(self, top, left, height, width) + return Video.new_like(self, output) + + def center_crop(self, output_size: List[int]) -> Video: + output = self._F.center_crop_video(self, output_size=output_size) + return Video.new_like(self, output) + + def resized_crop( + self, + top: int, + left: int, + height: int, + width: int, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias: bool = False, + ) -> Video: + output = self._F.resized_crop_video( + self, top, left, height, width, size=list(size), interpolation=interpolation, antialias=antialias + ) + return Video.new_like(self, output) + + def pad( + self, + padding: Union[int, List[int]], + fill: FillTypeJIT = None, + padding_mode: str = "constant", + ) -> Video: + output = self._F.pad_video(self, padding, fill=fill, padding_mode=padding_mode) + return Video.new_like(self, output) + + def rotate( + self, + angle: float, + interpolation: InterpolationMode = InterpolationMode.NEAREST, + expand: bool = False, + fill: FillTypeJIT = None, + center: Optional[List[float]] = None, + ) -> Video: + output = self._F._geometry.rotate_video( + self, angle, interpolation=interpolation, expand=expand, fill=fill, center=center + ) + return Video.new_like(self, output) + + def affine( + self, + angle: Union[int, float], + translate: List[float], + scale: float, + shear: List[float], + interpolation: InterpolationMode = InterpolationMode.NEAREST, + fill: FillTypeJIT = None, + center: 
Optional[List[float]] = None, + ) -> Video: + output = self._F._geometry.affine_video( + self, + angle, + translate=translate, + scale=scale, + shear=shear, + interpolation=interpolation, + fill=fill, + center=center, + ) + return Video.new_like(self, output) + + def perspective( + self, + perspective_coeffs: List[float], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: FillTypeJIT = None, + ) -> Video: + output = self._F._geometry.perspective_video(self, perspective_coeffs, interpolation=interpolation, fill=fill) + return Video.new_like(self, output) + + def elastic( + self, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: FillTypeJIT = None, + ) -> Video: + output = self._F._geometry.elastic_video(self, displacement, interpolation=interpolation, fill=fill) + return Video.new_like(self, output) + + def adjust_brightness(self, brightness_factor: float) -> Video: + output = self._F.adjust_brightness_video(self, brightness_factor=brightness_factor) + return Video.new_like(self, output) + + def adjust_saturation(self, saturation_factor: float) -> Video: + output = self._F.adjust_saturation_video(self, saturation_factor=saturation_factor) + return Video.new_like(self, output) + + def adjust_contrast(self, contrast_factor: float) -> Video: + output = self._F.adjust_contrast_video(self, contrast_factor=contrast_factor) + return Video.new_like(self, output) + + def adjust_sharpness(self, sharpness_factor: float) -> Video: + output = self._F.adjust_sharpness_video(self, sharpness_factor=sharpness_factor) + return Video.new_like(self, output) + + def adjust_hue(self, hue_factor: float) -> Video: + output = self._F.adjust_hue_video(self, hue_factor=hue_factor) + return Video.new_like(self, output) + + def adjust_gamma(self, gamma: float, gain: float = 1) -> Video: + output = self._F.adjust_gamma_video(self, gamma=gamma, gain=gain) + return Video.new_like(self, output) + + def posterize(self, bits: int) -> Video: + output = self._F.posterize_video(self, bits=bits) + return Video.new_like(self, output) + + def solarize(self, threshold: float) -> Video: + output = self._F.solarize_video(self, threshold=threshold) + return Video.new_like(self, output) + + def autocontrast(self) -> Video: + output = self._F.autocontrast_video(self) + return Video.new_like(self, output) + + def equalize(self) -> Video: + output = self._F.equalize_video(self) + return Video.new_like(self, output) + + def invert(self) -> Video: + output = self._F.invert_video(self) + return Video.new_like(self, output) + + def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Video: + output = self._F.gaussian_blur_video(self, kernel_size=kernel_size, sigma=sigma) + return Video.new_like(self, output) + + +VideoType = Union[torch.Tensor, Video] +VideoTypeJIT = torch.Tensor +LegacyVideoType = torch.Tensor +LegacyVideoTypeJIT = torch.Tensor +TensorVideoType = Union[torch.Tensor, Video] +TensorVideoTypeJIT = torch.Tensor + +ImageOrVideoType = Union[ImageType, VideoType] +ImageOrVideoTypeJIT = Union[ImageTypeJIT, VideoTypeJIT] +TensorImageOrVideoType = Union[TensorImageType, TensorVideoType] +TensorImageOrVideoTypeJIT = Union[TensorImageTypeJIT, TensorVideoTypeJIT] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 3cd925fd9..311ad6d5a 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -15,7 +15,7 @@ from ._utils 
import has_any, query_chw class RandomErasing(_RandomApplyTransform): - _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image) + _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image, features.Video) def __init__( self, @@ -92,7 +92,7 @@ class RandomErasing(_RandomApplyTransform): return dict(i=i, j=j, h=h, w=w, v=v) - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: if params["v"] is not None: inpt = F.erase(inpt, **params, inplace=self.inplace) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index c98e5c36e..4732f88d4 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -31,40 +31,41 @@ class _AutoAugmentBase(Transform): key = keys[int(torch.randint(len(keys), ()))] return key, dct[key] - def _extract_image( + def _extract_image_or_video( self, sample: Any, unsupported_types: Tuple[Type, ...] = (features.BoundingBox, features.Mask), - ) -> Tuple[int, features.ImageType]: + ) -> Tuple[int, features.ImageOrVideoType]: sample_flat, _ = tree_flatten(sample) - images = [] + image_or_videos = [] for id, inpt in enumerate(sample_flat): - if _isinstance(inpt, (features.Image, PIL.Image.Image, features.is_simple_tensor)): - images.append((id, inpt)) + if _isinstance(inpt, (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video)): + image_or_videos.append((id, inpt)) elif isinstance(inpt, unsupported_types): raise TypeError(f"Inputs of type {type(inpt).__name__} are not supported by {type(self).__name__}()") - if not images: + if not image_or_videos: raise TypeError("Found no image in the sample.") - if len(images) > 1: + if len(image_or_videos) > 1: raise TypeError( - f"Auto augment transformations are only properly defined for a single image, but found {len(images)}." + f"Auto augment transformations are only properly defined for a single image or video, " + f"but found {len(image_or_videos)}." 
) - return images[0] + return image_or_videos[0] def _put_into_sample(self, sample: Any, id: int, item: Any) -> Any: sample_flat, spec = tree_flatten(sample) sample_flat[id] = item return tree_unflatten(sample_flat, spec) - def _apply_image_transform( + def _apply_image_or_video_transform( self, - image: features.ImageType, + image: features.ImageOrVideoType, transform_id: str, magnitude: float, interpolation: InterpolationMode, fill: Dict[Type, features.FillType], - ) -> features.ImageType: + ) -> features.ImageOrVideoType: fill_ = fill[type(image)] fill_ = F._geometry._convert_fill_arg(fill_) @@ -276,8 +277,8 @@ class AutoAugment(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, image = self._extract_image(sample) - _, height, width = get_chw(image) + id, image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -295,11 +296,11 @@ class AutoAugment(_AutoAugmentBase): else: magnitude = 0.0 - image = self._apply_image_transform( - image, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill + image_or_video = self._apply_image_or_video_transform( + image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image) + return self._put_into_sample(sample, id, image_or_video) class RandAugment(_AutoAugmentBase): @@ -347,8 +348,8 @@ class RandAugment(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, image = self._extract_image(sample) - _, height, width = get_chw(image) + id, image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(image_or_video) for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -359,11 +360,11 @@ class RandAugment(_AutoAugmentBase): magnitude *= -1 else: magnitude = 0.0 - image = self._apply_image_transform( - image, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill + image_or_video = self._apply_image_or_video_transform( + image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image) + return self._put_into_sample(sample, id, image_or_video) class TrivialAugmentWide(_AutoAugmentBase): @@ -401,8 +402,8 @@ class TrivialAugmentWide(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, image = self._extract_image(sample) - _, height, width = get_chw(image) + id, image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(image_or_video) transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -414,10 +415,10 @@ class TrivialAugmentWide(_AutoAugmentBase): else: magnitude = 0.0 - image = self._apply_image_transform( - image, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill + image_or_video = self._apply_image_or_video_transform( + image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image) + return self._put_into_sample(sample, id, image_or_video) class AugMix(_AutoAugmentBase): @@ -471,27 +472,28 @@ class AugMix(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - id, 
orig_image = self._extract_image(sample) - _, height, width = get_chw(orig_image) + id, orig_image_or_video = self._extract_image_or_video(sample) + _, height, width = get_chw(orig_image_or_video) - if isinstance(orig_image, torch.Tensor): - image = orig_image + if isinstance(orig_image_or_video, torch.Tensor): + image_or_video = orig_image_or_video else: # isinstance(inpt, PIL.Image.Image): - image = F.pil_to_tensor(orig_image) + image_or_video = F.pil_to_tensor(orig_image_or_video) augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE - orig_dims = list(image.shape) - batch = image.view([1] * max(4 - image.ndim, 0) + orig_dims) + orig_dims = list(image_or_video.shape) + batch = image_or_video.view([1] * max(4 - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) - # Sample the beta weights for combining the original and augmented image. To get Beta, we use a Dirichlet - # with 2 parameters. The 1st column stores the weights of the original and the 2nd the ones of augmented image. + # Sample the beta weights for combining the original and augmented image or video. To get Beta, we use a + # Dirichlet with 2 parameters. The 1st column stores the weights of the original and the 2nd the ones of + # augmented image or video. m = self._sample_dirichlet( torch.tensor([self.alpha, self.alpha], device=batch.device).expand(batch_dims[0], -1) ) - # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images. + # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images or videos. combined_weights = self._sample_dirichlet( torch.tensor([self.alpha] * self.mixture_width, device=batch.device).expand(batch_dims[0], -1) ) * m[:, 1].view([batch_dims[0], -1]) @@ -511,15 +513,15 @@ class AugMix(_AutoAugmentBase): else: magnitude = 0.0 - aug = self._apply_image_transform( + aug = self._apply_image_or_video_transform( aug, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) mix.add_(combined_weights[:, i].view(batch_dims) * aug) - mix = mix.view(orig_dims).to(dtype=image.dtype) + mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) - if isinstance(orig_image, features.Image): - mix = features.Image.new_like(orig_image, mix) - elif isinstance(orig_image, PIL.Image.Image): + if isinstance(orig_image_or_video, (features.Image, features.Video)): + mix = type(orig_image_or_video).new_like(orig_image_or_video, mix) # type: ignore[arg-type] + elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) return self._put_into_sample(sample, id, mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index e0ee8d1b9..451b57b66 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -82,7 +82,7 @@ class ColorJitter(Transform): class RandomPhotometricDistort(Transform): - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__( self, @@ -110,20 +110,22 @@ class RandomPhotometricDistort(Transform): channel_permutation=torch.randperm(num_channels) if torch.rand(()) < self.p else None, ) - def _permute_channels(self, inpt: features.ImageType, permutation: torch.Tensor) -> features.ImageType: + def _permute_channels( + self, inpt: features.ImageOrVideoType, permutation: 
torch.Tensor + ) -> features.ImageOrVideoType: if isinstance(inpt, PIL.Image.Image): inpt = F.pil_to_tensor(inpt) output = inpt[..., permutation, :, :] - if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output, color_space=features.ColorSpace.OTHER) + if isinstance(inpt, (features.Image, features.Video)): + output = type(inpt).new_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] elif isinstance(inpt, PIL.Image.Image): output = F.to_image_pil(output) return output - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: if params["brightness"]: inpt = F.adjust_brightness( inpt, brightness_factor=ColorJitter._generate_value(self.brightness[0], self.brightness[1]) diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 008d4d195..1f132ec92 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -855,8 +855,10 @@ class FixedSizeCrop(Transform): return inpt def forward(self, *inputs: Any) -> Any: - if not has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor): - raise TypeError(f"{type(self).__name__}() requires input sample to contain an tensor or PIL image.") + if not has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." + ) if has_any(inputs, features.BoundingBox) and not has_any(inputs, features.Label, features.OneHotLabel): raise TypeError( diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index 2ea3014aa..cb090492a 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -34,7 +34,7 @@ class ConvertImageDtype(Transform): class ConvertColorSpace(Transform): - _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image) + _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image.Image, features.Video) def __init__( self, @@ -54,7 +54,7 @@ class ConvertColorSpace(Transform): self.copy = copy - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: return F.convert_color_space( inpt, color_space=self.color_space, old_color_space=self.old_color_space, copy=self.copy ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 976e9f8b5..2531bf8f6 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -38,7 +38,7 @@ class Lambda(Transform): class LinearTransformation(Transform): - _transformed_types = (features.is_simple_tensor, features.Image) + _transformed_types = (features.is_simple_tensor, features.Image, features.Video) def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tensor): super().__init__() @@ -68,7 +68,7 @@ class LinearTransformation(Transform): return super().forward(*inputs) - def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> torch.Tensor: + def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: # Image instance 
after linear transformation is not Image anymore due to unknown data range # Thus we will return Tensor for input Image @@ -93,7 +93,7 @@ class LinearTransformation(Transform): class Normalize(Transform): - _transformed_types = (features.Image, features.is_simple_tensor) + _transformed_types = (features.Image, features.is_simple_tensor, features.Video) def __init__(self, mean: Sequence[float], std: Sequence[float], inplace: bool = False): super().__init__() @@ -101,7 +101,7 @@ class Normalize(Transform): self.std = list(std) self.inplace = inplace - def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> torch.Tensor: + def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) def forward(self, *inpts: Any) -> Any: diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index 219e6e505..a76891a34 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -82,10 +82,10 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: chws = { get_chw(item) for item in flat_sample - if isinstance(item, (features.Image, PIL.Image.Image)) or features.is_simple_tensor(item) + if isinstance(item, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(item) } if not chws: - raise TypeError("No image was found in the sample") + raise TypeError("No image or video was found in the sample") elif len(chws) > 1: raise ValueError(f"Found multiple CxHxW dimensions in the sample: {sequence_to_str(sorted(chws))}") return chws.pop() diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index f081d101d..cb801df73 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -6,6 +6,7 @@ from ._meta import ( convert_format_bounding_box, convert_color_space_image_tensor, convert_color_space_image_pil, + convert_color_space_video, convert_color_space, get_dimensions, get_image_num_channels, @@ -13,41 +14,52 @@ from ._meta import ( get_spatial_size, ) # usort: skip -from ._augment import erase, erase_image_pil, erase_image_tensor +from ._augment import erase, erase_image_pil, erase_image_tensor, erase_video from ._color import ( adjust_brightness, adjust_brightness_image_pil, adjust_brightness_image_tensor, + adjust_brightness_video, adjust_contrast, adjust_contrast_image_pil, adjust_contrast_image_tensor, + adjust_contrast_video, adjust_gamma, adjust_gamma_image_pil, adjust_gamma_image_tensor, + adjust_gamma_video, adjust_hue, adjust_hue_image_pil, adjust_hue_image_tensor, + adjust_hue_video, adjust_saturation, adjust_saturation_image_pil, adjust_saturation_image_tensor, + adjust_saturation_video, adjust_sharpness, adjust_sharpness_image_pil, adjust_sharpness_image_tensor, + adjust_sharpness_video, autocontrast, autocontrast_image_pil, autocontrast_image_tensor, + autocontrast_video, equalize, equalize_image_pil, equalize_image_tensor, + equalize_video, invert, invert_image_pil, invert_image_tensor, + invert_video, posterize, posterize_image_pil, posterize_image_tensor, + posterize_video, solarize, solarize_image_pil, solarize_image_tensor, + solarize_video, ) from ._geometry import ( affine, @@ -55,22 +67,26 @@ from ._geometry import ( affine_image_pil, affine_image_tensor, affine_mask, + affine_video, center_crop, 
center_crop_bounding_box, center_crop_image_pil, center_crop_image_tensor, center_crop_mask, + center_crop_video, crop, crop_bounding_box, crop_image_pil, crop_image_tensor, crop_mask, + crop_video, elastic, elastic_bounding_box, elastic_image_pil, elastic_image_tensor, elastic_mask, elastic_transform, + elastic_video, five_crop, five_crop_image_pil, five_crop_image_tensor, @@ -80,31 +96,37 @@ from ._geometry import ( horizontal_flip_image_pil, horizontal_flip_image_tensor, horizontal_flip_mask, + horizontal_flip_video, pad, pad_bounding_box, pad_image_pil, pad_image_tensor, pad_mask, + pad_video, perspective, perspective_bounding_box, perspective_image_pil, perspective_image_tensor, perspective_mask, + perspective_video, resize, resize_bounding_box, resize_image_pil, resize_image_tensor, resize_mask, + resize_video, resized_crop, resized_crop_bounding_box, resized_crop_image_pil, resized_crop_image_tensor, resized_crop_mask, + resized_crop_video, rotate, rotate_bounding_box, rotate_image_pil, rotate_image_tensor, rotate_mask, + rotate_video, ten_crop, ten_crop_image_pil, ten_crop_image_tensor, @@ -113,9 +135,18 @@ from ._geometry import ( vertical_flip_image_pil, vertical_flip_image_tensor, vertical_flip_mask, + vertical_flip_video, vflip, ) -from ._misc import gaussian_blur, gaussian_blur_image_pil, gaussian_blur_image_tensor, normalize, normalize_image_tensor +from ._misc import ( + gaussian_blur, + gaussian_blur_image_pil, + gaussian_blur_image_tensor, + gaussian_blur_video, + normalize, + normalize_image_tensor, + normalize_video, +) from ._type_conversion import ( convert_image_dtype, decode_image_with_pil, diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index fb48c3588..976feb99e 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -17,19 +17,25 @@ def erase_image_pil( return to_pil_image(output, mode=image.mode) +def erase_video( + video: torch.Tensor, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False +) -> torch.Tensor: + return erase_image_tensor(video, i=i, j=j, h=h, w=w, v=v, inplace=inplace) + + def erase( - inpt: features.ImageTypeJIT, + inpt: features.ImageOrVideoTypeJIT, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False, -) -> features.ImageTypeJIT: +) -> features.ImageOrVideoTypeJIT: if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) - if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output) + if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): + output = type(inpt).new_like(inpt, output) # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return erase_image_pil(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index f375cb048..d11dd3c3b 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -2,10 +2,16 @@ import torch from torchvision.prototype import features from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT +from ._meta import get_dimensions_image_tensor + adjust_brightness_image_tensor = _FT.adjust_brightness adjust_brightness_image_pil = _FP.adjust_brightness 
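# --- Editor's note: an illustrative sketch, not part of the patch. The *_video kernels
# added below are thin wrappers over the corresponding *_image_tensor kernels: those
# kernels only look at the trailing (C, H, W) dimensions, so a (..., T, C, H, W) video
# tensor passes through them with the frame dimension treated as just another batch
# dimension and every frame processed the same way. The helper below is a hypothetical
# usage sketch of that behaviour and is not referenced anywhere in the patch.


def _sketch_brighten_clip(clip: torch.Tensor, brightness_factor: float = 1.5) -> torch.Tensor:
    # clip: a (T, C, H, W) uint8 or float video tensor; output keeps the same shape and dtype.
    return adjust_brightness_image_tensor(clip, brightness_factor=brightness_factor)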
+def adjust_brightness_video(video: torch.Tensor, brightness_factor: float) -> torch.Tensor: + return adjust_brightness_image_tensor(video, brightness_factor=brightness_factor) + + def adjust_brightness(inpt: features.InputTypeJIT, brightness_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_brightness_image_tensor(inpt, brightness_factor=brightness_factor) @@ -19,6 +25,10 @@ adjust_saturation_image_tensor = _FT.adjust_saturation adjust_saturation_image_pil = _FP.adjust_saturation +def adjust_saturation_video(video: torch.Tensor, saturation_factor: float) -> torch.Tensor: + return adjust_saturation_image_tensor(video, saturation_factor=saturation_factor) + + def adjust_saturation(inpt: features.InputTypeJIT, saturation_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_saturation_image_tensor(inpt, saturation_factor=saturation_factor) @@ -32,6 +42,10 @@ adjust_contrast_image_tensor = _FT.adjust_contrast adjust_contrast_image_pil = _FP.adjust_contrast +def adjust_contrast_video(video: torch.Tensor, contrast_factor: float) -> torch.Tensor: + return adjust_contrast_image_tensor(video, contrast_factor=contrast_factor) + + def adjust_contrast(inpt: features.InputTypeJIT, contrast_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_contrast_image_tensor(inpt, contrast_factor=contrast_factor) @@ -41,10 +55,40 @@ def adjust_contrast(inpt: features.InputTypeJIT, contrast_factor: float) -> feat return adjust_contrast_image_pil(inpt, contrast_factor=contrast_factor) -adjust_sharpness_image_tensor = _FT.adjust_sharpness +def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float) -> torch.Tensor: + num_channels, height, width = get_dimensions_image_tensor(image) + if num_channels not in (1, 3): + raise TypeError(f"Input image tensor can have 1 or 3 channels, but found {num_channels}") + + if sharpness_factor < 0: + raise ValueError(f"sharpness_factor ({sharpness_factor}) is not non-negative.") + + if image.numel() == 0 or height <= 2 or width <= 2: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view(-1, num_channels, height, width) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT._blend(image, _FT._blurred_degenerate_image(image), sharpness_factor) + + if needs_unsquash: + output = output.view(shape) + + return output + + adjust_sharpness_image_pil = _FP.adjust_sharpness +def adjust_sharpness_video(video: torch.Tensor, sharpness_factor: float) -> torch.Tensor: + return adjust_sharpness_image_tensor(video, sharpness_factor=sharpness_factor) + + def adjust_sharpness(inpt: features.InputTypeJIT, sharpness_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_sharpness_image_tensor(inpt, sharpness_factor=sharpness_factor) @@ -58,6 +102,10 @@ adjust_hue_image_tensor = _FT.adjust_hue adjust_hue_image_pil = _FP.adjust_hue +def adjust_hue_video(video: torch.Tensor, hue_factor: float) -> torch.Tensor: + return adjust_hue_image_tensor(video, hue_factor=hue_factor) + + def adjust_hue(inpt: features.InputTypeJIT, hue_factor: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and 
(torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_hue_image_tensor(inpt, hue_factor=hue_factor) @@ -71,6 +119,10 @@ adjust_gamma_image_tensor = _FT.adjust_gamma adjust_gamma_image_pil = _FP.adjust_gamma +def adjust_gamma_video(video: torch.Tensor, gamma: float, gain: float = 1) -> torch.Tensor: + return adjust_gamma_image_tensor(video, gamma=gamma, gain=gain) + + def adjust_gamma(inpt: features.InputTypeJIT, gamma: float, gain: float = 1) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return adjust_gamma_image_tensor(inpt, gamma=gamma, gain=gain) @@ -84,6 +136,10 @@ posterize_image_tensor = _FT.posterize posterize_image_pil = _FP.posterize +def posterize_video(video: torch.Tensor, bits: int) -> torch.Tensor: + return posterize_image_tensor(video, bits=bits) + + def posterize(inpt: features.InputTypeJIT, bits: int) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return posterize_image_tensor(inpt, bits=bits) @@ -97,6 +153,10 @@ solarize_image_tensor = _FT.solarize solarize_image_pil = _FP.solarize +def solarize_video(video: torch.Tensor, threshold: float) -> torch.Tensor: + return solarize_image_tensor(video, threshold=threshold) + + def solarize(inpt: features.InputTypeJIT, threshold: float) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return solarize_image_tensor(inpt, threshold=threshold) @@ -110,6 +170,10 @@ autocontrast_image_tensor = _FT.autocontrast autocontrast_image_pil = _FP.autocontrast +def autocontrast_video(video: torch.Tensor) -> torch.Tensor: + return autocontrast_image_tensor(video) + + def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return autocontrast_image_tensor(inpt) @@ -119,10 +183,35 @@ def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: return autocontrast_image_pil(inpt) -equalize_image_tensor = _FT.equalize +def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: + if image.dtype != torch.uint8: + raise TypeError(f"Only torch.uint8 image tensors are supported, but found {image.dtype}") + + num_channels, height, width = get_dimensions_image_tensor(image) + if num_channels not in (1, 3): + raise TypeError(f"Input image tensor can have 1 or 3 channels, but found {num_channels}") + + if image.numel() == 0: + return image + elif image.ndim == 2: + return _FT._scale_channel(image) + else: + return torch.stack( + [ + # TODO: when merging transforms v1 and v2, we can inline this function call + _FT._equalize_single_image(single_image) + for single_image in image.view(-1, num_channels, height, width) + ] + ).view(image.shape) + + equalize_image_pil = _FP.equalize +def equalize_video(video: torch.Tensor) -> torch.Tensor: + return equalize_image_tensor(video) + + def equalize(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return equalize_image_tensor(inpt) @@ -136,6 +225,10 @@ invert_image_tensor = _FT.invert invert_image_pil = _FP.invert +def invert_video(video: torch.Tensor) -> torch.Tensor: + return invert_image_tensor(video) + + def invert(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if 
isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return invert_image_tensor(inpt) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 7a291967b..f205b5aea 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -47,6 +47,10 @@ def horizontal_flip_bounding_box( ).view(shape) +def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor: + return horizontal_flip_image_tensor(video) + + def horizontal_flip(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return horizontal_flip_image_tensor(inpt) @@ -80,6 +84,10 @@ def vertical_flip_bounding_box( ).view(shape) +def vertical_flip_video(video: torch.Tensor) -> torch.Tensor: + return vertical_flip_image_tensor(video) + + def vertical_flip(inpt: features.InputTypeJIT) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return vertical_flip_image_tensor(inpt) @@ -185,6 +193,16 @@ def resize_bounding_box( ) +def resize_video( + video: torch.Tensor, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + max_size: Optional[int] = None, + antialias: bool = False, +) -> torch.Tensor: + return resize_image_tensor(video, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias) + + def resize( inpt: features.InputTypeJIT, size: List[int], @@ -441,6 +459,28 @@ def affine_mask( return output +def affine_video( + video: torch.Tensor, + angle: Union[int, float], + translate: List[float], + scale: float, + shear: List[float], + interpolation: InterpolationMode = InterpolationMode.NEAREST, + fill: features.FillTypeJIT = None, + center: Optional[List[float]] = None, +) -> torch.Tensor: + return affine_image_tensor( + video, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + interpolation=interpolation, + fill=fill, + center=center, + ) + + def _convert_fill_arg(fill: features.FillType) -> features.FillTypeJIT: # Fill = 0 is not equivalent to None, https://github.com/pytorch/vision/issues/6517 # So, we can't reassign fill to 0 @@ -614,6 +654,17 @@ def rotate_mask( return output +def rotate_video( + video: torch.Tensor, + angle: float, + interpolation: InterpolationMode = InterpolationMode.NEAREST, + expand: bool = False, + fill: features.FillTypeJIT = None, + center: Optional[List[float]] = None, +) -> torch.Tensor: + return rotate_image_tensor(video, angle, interpolation=interpolation, expand=expand, fill=fill, center=center) + + def rotate( inpt: features.InputTypeJIT, angle: float, @@ -751,6 +802,15 @@ def pad_bounding_box( return bounding_box, (height, width) +def pad_video( + video: torch.Tensor, + padding: Union[int, List[int]], + fill: features.FillTypeJIT = None, + padding_mode: str = "constant", +) -> torch.Tensor: + return pad_image_tensor(video, padding, fill=fill, padding_mode=padding_mode) + + def pad( inpt: features.InputTypeJIT, padding: Union[int, List[int]], @@ -798,6 +858,10 @@ def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) return crop_image_tensor(mask, top, left, height, width) +def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: + return crop_image_tensor(video, top, left, height, 
width) + + def crop(inpt: features.InputTypeJIT, top: int, left: int, height: int, width: int) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return crop_image_tensor(inpt, top, left, height, width) @@ -932,6 +996,33 @@ def perspective_mask( return output +def perspective_video( + video: torch.Tensor, + perspective_coeffs: List[float], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: features.FillTypeJIT = None, +) -> torch.Tensor: + # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when + # https://github.com/pytorch/vision/issues/6670 is resolved. + if video.numel() == 0: + return video + + shape = video.shape + + if video.ndim > 4: + video = video.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = perspective_image_tensor(video, perspective_coeffs, interpolation=interpolation, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output + + def perspective( inpt: features.InputTypeJIT, perspective_coeffs: List[float], @@ -1026,6 +1117,33 @@ def elastic_mask( return output +def elastic_video( + video: torch.Tensor, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: features.FillTypeJIT = None, +) -> torch.Tensor: + # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when + # https://github.com/pytorch/vision/issues/6670 is resolved. + if video.numel() == 0: + return video + + shape = video.shape + + if video.ndim > 4: + video = video.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = elastic_image_tensor(video, displacement, interpolation=interpolation, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output + + def elastic( inpt: features.InputTypeJIT, displacement: torch.Tensor, @@ -1128,6 +1246,10 @@ def center_crop_mask(mask: torch.Tensor, output_size: List[int]) -> torch.Tensor return output +def center_crop_video(video: torch.Tensor, output_size: List[int]) -> torch.Tensor: + return center_crop_image_tensor(video, output_size) + + def center_crop(inpt: features.InputTypeJIT, output_size: List[int]) -> features.InputTypeJIT: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return center_crop_image_tensor(inpt, output_size) @@ -1190,6 +1312,21 @@ def resized_crop_mask( return resize_mask(mask, size) +def resized_crop_video( + video: torch.Tensor, + top: int, + left: int, + height: int, + width: int, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias: bool = False, +) -> torch.Tensor: + return resized_crop_image_tensor( + video, top, left, height, width, antialias=antialias, size=size, interpolation=interpolation + ) + + def resized_crop( inpt: features.InputTypeJIT, top: int, diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index 90cfffcf2..1e53edf39 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -11,10 +11,12 @@ get_dimensions_image_pil = _FP.get_dimensions # TODO: Should this be prefixed with `_` similar to other methods that don't get exposed by init? 
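# The hunk below widens this helper from images to images-or-videos. The
# dispatch pattern is: plain tensors (or any tensor while scripting) go through
# the tensor kernel, feature subclasses expose their own metadata, and anything
# else is assumed to be a PIL image. A toy sketch of that dispatch, with
# FakeImage standing in for features.Image / features.Video (illustrative
# names only, not the torchvision API):
import torch


class FakeImage(torch.Tensor):
    """Stand-in for a feature subclass that knows its own spatial size."""


def get_spatial_size_sketch(inpt):
    if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, FakeImage)):
        return list(inpt.shape[-2:])  # simple tensor: read the trailing dims
    elif isinstance(inpt, FakeImage):
        return list(inpt.shape[-2:])  # subclass: could read stored metadata instead
    else:  # PIL.Image.Image
        return [inpt.height, inpt.width]


print(get_spatial_size_sketch(torch.rand(3, 4, 5)))                             # [4, 5]
print(get_spatial_size_sketch(torch.rand(2, 3, 4, 5).as_subclass(FakeImage)))   # [4, 5]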
-def get_chw(image: features.ImageTypeJIT) -> Tuple[int, int, int]: - if isinstance(image, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(image, features.Image)): +def get_chw(image: features.ImageOrVideoTypeJIT) -> Tuple[int, int, int]: + if isinstance(image, torch.Tensor) and ( + torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) + ): channels, height, width = get_dimensions_image_tensor(image) - elif isinstance(image, features.Image): + elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels height, width = image.image_size else: # isinstance(image, PIL.Image.Image) @@ -29,11 +31,11 @@ def get_chw(image: features.ImageTypeJIT) -> Tuple[int, int, int]: # detailed above. -def get_dimensions(image: features.ImageTypeJIT) -> List[int]: +def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: return list(get_chw(image)) -def get_num_channels(image: features.ImageTypeJIT) -> int: +def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: num_channels, *_ = get_chw(image) return num_channels @@ -43,7 +45,7 @@ def get_num_channels(image: features.ImageTypeJIT) -> int: get_image_num_channels = get_num_channels -def get_spatial_size(image: features.ImageTypeJIT) -> List[int]: +def get_spatial_size(image: features.ImageOrVideoTypeJIT) -> List[int]: _, *size = get_chw(image) return size @@ -207,13 +209,23 @@ def convert_color_space_image_pil( return image.convert(new_mode) +def convert_color_space_video( + video: torch.Tensor, old_color_space: ColorSpace, new_color_space: ColorSpace, copy: bool = True +) -> torch.Tensor: + return convert_color_space_image_tensor( + video, old_color_space=old_color_space, new_color_space=new_color_space, copy=copy + ) + + def convert_color_space( - inpt: features.ImageTypeJIT, + inpt: features.ImageOrVideoTypeJIT, color_space: ColorSpace, old_color_space: Optional[ColorSpace] = None, copy: bool = True, -) -> features.ImageTypeJIT: - if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Image)): +) -> features.ImageOrVideoTypeJIT: + if isinstance(inpt, torch.Tensor) and ( + torch.jit.is_scripting() or not isinstance(inpt, (features.Image, features.Video)) + ): if old_color_space is None: raise RuntimeError( "In order to convert the color space of simple tensor images, " @@ -222,7 +234,7 @@ def convert_color_space( return convert_color_space_image_tensor( inpt, old_color_space=old_color_space, new_color_space=color_space, copy=copy ) - elif isinstance(inpt, features.Image): + elif isinstance(inpt, (features.Image, features.Video)): return inpt.to_color_space(color_space, copy=copy) else: - return cast(features.ImageTypeJIT, convert_color_space_image_pil(inpt, color_space, copy=copy)) + return cast(features.ImageOrVideoTypeJIT, convert_color_space_image_pil(inpt, color_space, copy=copy)) diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 6f35781d4..7b3773e63 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -9,18 +9,22 @@ from torchvision.transforms.functional import pil_to_tensor, to_pil_image normalize_image_tensor = _FT.normalize +def normalize_video(video: torch.Tensor, mean: List[float], std: List[float], inplace: bool = False) -> torch.Tensor: + return normalize_image_tensor(video, mean, std, inplace=inplace) + + def normalize( - inpt: features.TensorImageTypeJIT, mean: 
List[float], std: List[float], inplace: bool = False + inpt: features.TensorImageOrVideoTypeJIT, mean: List[float], std: List[float], inplace: bool = False ) -> torch.Tensor: if torch.jit.is_scripting(): correct_type = isinstance(inpt, torch.Tensor) else: - correct_type = features.is_simple_tensor(inpt) or isinstance(inpt, features.Image) + correct_type = features.is_simple_tensor(inpt) or isinstance(inpt, (features.Image, features.Video)) inpt = inpt.as_subclass(torch.Tensor) if not correct_type: raise TypeError(f"img should be Tensor Image. Got {type(inpt)}") - # Image instance after normalization is not Image anymore due to unknown data range + # Image or Video type should not be retained after normalization due to unknown data range # Thus we return Tensor for input Image return normalize_image_tensor(inpt, mean=mean, std=std, inplace=inplace) @@ -64,6 +68,30 @@ def gaussian_blur_image_pil( return to_pil_image(output, mode=image.mode) +def gaussian_blur_video( + video: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None +) -> torch.Tensor: + # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when + # https://github.com/pytorch/vision/issues/6670 is resolved. + if video.numel() == 0: + return video + + shape = video.shape + + if video.ndim > 4: + video = video.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = gaussian_blur_image_tensor(video, kernel_size, sigma) + + if needs_unsquash: + output = output.view(shape) + + return output + + def gaussian_blur( inpt: features.InputTypeJIT, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> features.InputTypeJIT: -- GitLab From 4c049ca3b74c2f93bb2acd952548626aada08fe0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 7 Oct 2022 17:35:23 +0200 Subject: [PATCH 021/624] replace new_like with wrap_like (#6718) * replace new_like with wrap_like * fix videos * revert casting in favor of ignoring mypy --- test/test_prototype_features.py | 4 +- test/test_prototype_transforms.py | 17 ++-- .../prototype/features/_bounding_box.py | 57 +++++++------ torchvision/prototype/features/_encoded.py | 11 ++- torchvision/prototype/features/_feature.py | 55 +++++------- torchvision/prototype/features/_image.py | 85 ++++++++++--------- torchvision/prototype/features/_label.py | 20 +++-- torchvision/prototype/features/_mask.py | 47 +++++++--- torchvision/prototype/features/_video.py | 74 ++++++++-------- torchvision/prototype/transforms/_augment.py | 22 ++--- .../prototype/transforms/_auto_augment.py | 2 +- torchvision/prototype/transforms/_color.py | 3 +- .../prototype/transforms/_deprecated.py | 4 +- torchvision/prototype/transforms/_geometry.py | 16 ++-- torchvision/prototype/transforms/_meta.py | 10 ++- torchvision/prototype/transforms/_misc.py | 2 +- .../transforms/functional/_augment.py | 2 +- .../transforms/functional/_geometry.py | 4 +- 18 files changed, 239 insertions(+), 196 deletions(-) diff --git a/test/test_prototype_features.py b/test/test_prototype_features.py index 2701dd66b..d2b0d2e63 100644 --- a/test/test_prototype_features.py +++ b/test/test_prototype_features.py @@ -99,14 +99,14 @@ def test_inplace_op_no_wrapping(): assert type(label) is features.Label -def test_new_like(): +def test_wrap_like(): tensor = torch.tensor([0, 1, 0], dtype=torch.int64) label = features.Label(tensor, categories=["foo", "bar"]) # any operation besides .to() and .clone() will do here output = label * 2 - label_new = 
features.Label.new_like(label, output) + label_new = features.Label.wrap_like(label, output) assert type(label_new) is features.Label assert label_new.data_ptr() == output.data_ptr() diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 916861f4e..4037a7467 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -8,6 +8,7 @@ import pytest import torch from common_utils import assert_equal, cpu_and_gpu from prototype_common_utils import ( + DEFAULT_EXTRA_DIMS, make_bounding_box, make_bounding_boxes, make_detection_mask, @@ -23,6 +24,8 @@ from torchvision.ops.boxes import box_iou from torchvision.prototype import features, transforms from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image +BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] + def make_vanilla_tensor_images(*args, **kwargs): for image in make_images(*args, **kwargs): @@ -109,13 +112,11 @@ class TestSmoke: ( transform, [ - dict( - image=features.Image.new_like(image, image.unsqueeze(0), dtype=torch.float), - one_hot_label=features.OneHotLabel.new_like( - one_hot_label, one_hot_label.unsqueeze(0), dtype=torch.float - ), + dict(image=image, one_hot_label=one_hot_label) + for image, one_hot_label in itertools.product( + make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + make_one_hot_labels(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), ) - for image, one_hot_label in itertools.product(make_images(), make_one_hot_labels()) ], ) for transform in [ @@ -300,7 +301,7 @@ class TestRandomHorizontalFlip: actual = transform(input) expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) + expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format assert actual.image_size == expected.image_size @@ -353,7 +354,7 @@ class TestRandomVerticalFlip: actual = transform(input) expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) + expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format assert actual.image_size == expected.image_size diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py index 9ccd4fa62..7b69af5f9 100644 --- a/torchvision/prototype/features/_bounding_box.py +++ b/torchvision/prototype/features/_bounding_box.py @@ -19,6 +19,13 @@ class BoundingBox(_Feature): format: BoundingBoxFormat image_size: Tuple[int, int] + @classmethod + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, image_size: Tuple[int, int]) -> BoundingBox: + bounding_box = tensor.as_subclass(cls) + bounding_box.format = format + bounding_box.image_size = image_size + return bounding_box + def __new__( cls, data: Any, @@ -29,52 +36,46 @@ class BoundingBox(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> BoundingBox: - bounding_box = super().__new__(cls, data, dtype=dtype, device=device, requires_grad=requires_grad) + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - bounding_box.format = format - - 
bounding_box.image_size = image_size - return bounding_box - - def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, image_size=self.image_size) + return cls._wrap(tensor, format=format, image_size=image_size) @classmethod - def new_like( + def wrap_like( cls, other: BoundingBox, - data: Any, + tensor: torch.Tensor, *, - format: Optional[Union[BoundingBoxFormat, str]] = None, + format: Optional[BoundingBoxFormat] = None, image_size: Optional[Tuple[int, int]] = None, - **kwargs: Any, ) -> BoundingBox: - return super().new_like( - other, - data, + return cls._wrap( + tensor, format=format if format is not None else other.format, image_size=image_size if image_size is not None else other.image_size, - **kwargs, ) + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(format=self.format, image_size=self.image_size) + def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - return BoundingBox.new_like( + return BoundingBox.wrap_like( self, self._F.convert_format_bounding_box(self, old_format=self.format, new_format=format), format=format ) def horizontal_flip(self) -> BoundingBox: output = self._F.horizontal_flip_bounding_box(self, format=self.format, image_size=self.image_size) - return BoundingBox.new_like(self, output) + return BoundingBox.wrap_like(self, output) def vertical_flip(self) -> BoundingBox: output = self._F.vertical_flip_bounding_box(self, format=self.format, image_size=self.image_size) - return BoundingBox.new_like(self, output) + return BoundingBox.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -84,19 +85,19 @@ class BoundingBox(_Feature): antialias: bool = False, ) -> BoundingBox: output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: output, image_size = self._F.crop_bounding_box( self, self.format, top=top, left=left, height=height, width=width ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def center_crop(self, output_size: List[int]) -> BoundingBox: output, image_size = self._F.center_crop_bounding_box( self, format=self.format, image_size=self.image_size, output_size=output_size ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def resized_crop( self, @@ -109,7 +110,7 @@ class BoundingBox(_Feature): antialias: bool = False, ) -> BoundingBox: output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def pad( self, @@ -120,7 +121,7 @@ class BoundingBox(_Feature): output, image_size = self._F.pad_bounding_box( self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def rotate( self, @@ -133,7 +134,7 @@ class 
BoundingBox(_Feature): output, image_size = self._F.rotate_bounding_box( self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center ) - return BoundingBox.new_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, image_size=image_size) def affine( self, @@ -155,7 +156,7 @@ class BoundingBox(_Feature): shear=shear, center=center, ) - return BoundingBox.new_like(self, output, dtype=output.dtype) + return BoundingBox.wrap_like(self, output) def perspective( self, @@ -164,7 +165,7 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, ) -> BoundingBox: output = self._F.perspective_bounding_box(self, self.format, perspective_coeffs) - return BoundingBox.new_like(self, output, dtype=output.dtype) + return BoundingBox.wrap_like(self, output) def elastic( self, @@ -173,4 +174,4 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, ) -> BoundingBox: output = self._F.elastic_bounding_box(self, self.format, displacement) - return BoundingBox.new_like(self, output, dtype=output.dtype) + return BoundingBox.wrap_like(self, output) diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 0ec14ab20..4b963986b 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -14,6 +14,10 @@ D = TypeVar("D", bound="EncodedData") class EncodedData(_Feature): + @classmethod + def _wrap(cls: Type[D], tensor: torch.Tensor) -> D: + return tensor.as_subclass(cls) + def __new__( cls, data: Any, @@ -22,8 +26,13 @@ class EncodedData(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> EncodedData: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) # TODO: warn / bail out if we encounter a tensor with shape other than (N,) or with dtype other than uint8? - return super().__new__(cls, data, dtype=dtype, device=device, requires_grad=requires_grad) + return cls._wrap(tensor) + + @classmethod + def wrap_like(cls: Type[D], other: D, tensor: torch.Tensor) -> D: + return cls._wrap(tensor) @classmethod def from_file(cls: Type[D], file: BinaryIO, **kwargs: Any) -> D: diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index 2da10be90..a56441f29 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -21,48 +21,39 @@ def is_simple_tensor(inpt: Any) -> bool: class _Feature(torch.Tensor): __F: Optional[ModuleType] = None - def __new__( - cls: Type[F], + @staticmethod + def _to_tensor( data: Any, - *, dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, - ) -> F: - return ( - torch.as_tensor( # type: ignore[return-value] - data, - dtype=dtype, - device=device, - ) - .as_subclass(cls) - .requires_grad_(requires_grad) - ) + ) -> torch.Tensor: + return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad) - @classmethod - def new_like( - cls: Type[F], - other: F, + # FIXME: this is just here for BC with the prototype datasets. Some datasets use the _Feature directly to have a + # a no-op input for the prototype transforms. For this use case, we can't use plain tensors, since they will be + # interpreted as images. We should decide if we want a public no-op feature like `GenericFeature` or make this one + # public again. 
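# The surrounding commit replaces `new_like` with `wrap_like`: instead of
# re-running the validating constructor, the raw output tensor of an op is
# wrapped back into the input's feature type and the metadata is copied from
# the reference instance. A hypothetical toy version of that pattern
# (illustrative only, not the torchvision classes):
import torch


class ToyLabel(torch.Tensor):
    categories = None

    @classmethod
    def _wrap(cls, tensor: torch.Tensor, *, categories):
        wrapped = tensor.as_subclass(cls)
        wrapped.categories = categories
        return wrapped

    @classmethod
    def wrap_like(cls, other: "ToyLabel", tensor: torch.Tensor) -> "ToyLabel":
        return cls._wrap(tensor, categories=other.categories)


label = ToyLabel._wrap(torch.tensor([0, 1, 0]), categories=["foo", "bar"])
output = torch.mul(label, 2)                   # plain op output; metadata is not carried over
relabeled = ToyLabel.wrap_like(label, output)  # re-attach type and metadata cheaply
assert isinstance(relabeled, ToyLabel) and relabeled.categories == ["foo", "bar"]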
+ def __new__( + cls, data: Any, - *, dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, - requires_grad: Optional[bool] = None, - **kwargs: Any, - ) -> F: - return cls( - data, - dtype=dtype if dtype is not None else other.dtype, - device=device if device is not None else other.device, - requires_grad=requires_grad if requires_grad is not None else other.requires_grad, - **kwargs, - ) + requires_grad: bool = False, + ) -> _Feature: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + return tensor.as_subclass(_Feature) + + @classmethod + def wrap_like(cls: Type[F], other: F, tensor: torch.Tensor) -> F: + # FIXME: this is just here for BC with the prototype datasets. See __new__ for details. If that is resolved, + # this method should be made abstract + # raise NotImplementedError + return tensor.as_subclass(cls) _NO_WRAPPING_EXCEPTIONS = { - torch.Tensor.clone: lambda cls, input, output: cls.new_like(input, output), - torch.Tensor.to: lambda cls, input, output: cls.new_like( - input, output, dtype=output.dtype, device=output.device - ), + torch.Tensor.clone: lambda cls, input, output: cls.wrap_like(input, output), + torch.Tensor.to: lambda cls, input, output: cls.wrap_like(input, output), # We don't need to wrap the output of `Tensor.requires_grad_`, since it is an inplace operation and thus # retains the type automatically torch.Tensor.requires_grad_: lambda cls, input, output: output, diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index c953ae78c..23f81678d 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -62,6 +62,12 @@ def _from_tensor_shape(shape: List[int]) -> ColorSpace: class Image(_Feature): color_space: ColorSpace + @classmethod + def _wrap(cls, tensor: torch.Tensor, *, color_space: ColorSpace) -> Image: + image = tensor.as_subclass(cls) + image.color_space = color_space + return image + def __new__( cls, data: Any, @@ -71,36 +77,33 @@ class Image(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> Image: - data = torch.as_tensor(data, dtype=dtype, device=device) - if data.ndim < 2: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + if tensor.ndim < 2: raise ValueError - elif data.ndim == 2: - data = data.unsqueeze(0) - image = super().__new__(cls, data, requires_grad=requires_grad) + elif tensor.ndim == 2: + tensor = tensor.unsqueeze(0) if color_space is None: - color_space = ColorSpace.from_tensor_shape(image.shape) # type: ignore[arg-type] + color_space = ColorSpace.from_tensor_shape(tensor.shape) # type: ignore[arg-type] if color_space == ColorSpace.OTHER: warnings.warn("Unable to guess a specific color space. 
Consider passing it explicitly.") elif isinstance(color_space, str): color_space = ColorSpace.from_str(color_space.upper()) elif not isinstance(color_space, ColorSpace): raise ValueError - image.color_space = color_space - return image - - def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(color_space=self.color_space) + return cls._wrap(tensor, color_space=color_space) @classmethod - def new_like( - cls, other: Image, data: Any, *, color_space: Optional[Union[ColorSpace, str]] = None, **kwargs: Any - ) -> Image: - return super().new_like( - other, data, color_space=color_space if color_space is not None else other.color_space, **kwargs + def wrap_like(cls, other: Image, tensor: torch.Tensor, *, color_space: Optional[ColorSpace] = None) -> Image: + return cls._wrap( + tensor, + color_space=color_space if color_space is not None else other.color_space, ) + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(color_space=self.color_space) + @property def image_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @@ -113,7 +116,7 @@ class Image(_Feature): if isinstance(color_space, str): color_space = ColorSpace.from_str(color_space.upper()) - return Image.new_like( + return Image.wrap_like( self, self._F.convert_color_space_image_tensor( self, old_color_space=self.color_space, new_color_space=color_space, copy=copy @@ -129,15 +132,15 @@ class Image(_Feature): def draw_bounding_box(self, bounding_box: BoundingBox, **kwargs: Any) -> Image: # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we # promote this out of the prototype state - return Image.new_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs)) + return Image.wrap_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs)) def horizontal_flip(self) -> Image: output = self._F.horizontal_flip_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def vertical_flip(self) -> Image: output = self._F.vertical_flip_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -149,15 +152,15 @@ class Image(_Feature): output = self._F.resize_image_tensor( self, size, interpolation=interpolation, max_size=max_size, antialias=antialias ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def crop(self, top: int, left: int, height: int, width: int) -> Image: output = self._F.crop_image_tensor(self, top, left, height, width) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def center_crop(self, output_size: List[int]) -> Image: output = self._F.center_crop_image_tensor(self, output_size=output_size) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def resized_crop( self, @@ -172,7 +175,7 @@ class Image(_Feature): output = self._F.resized_crop_image_tensor( self, top, left, height, width, size=list(size), interpolation=interpolation, antialias=antialias ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def pad( self, @@ -181,7 +184,7 @@ class Image(_Feature): padding_mode: str = "constant", ) -> Image: output = self._F.pad_image_tensor(self, padding, fill=fill, padding_mode=padding_mode) - return Image.new_like(self, output) + return 
Image.wrap_like(self, output) def rotate( self, @@ -194,7 +197,7 @@ class Image(_Feature): output = self._F._geometry.rotate_image_tensor( self, angle, interpolation=interpolation, expand=expand, fill=fill, center=center ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def affine( self, @@ -216,7 +219,7 @@ class Image(_Feature): fill=fill, center=center, ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def perspective( self, @@ -227,7 +230,7 @@ class Image(_Feature): output = self._F._geometry.perspective_image_tensor( self, perspective_coeffs, interpolation=interpolation, fill=fill ) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def elastic( self, @@ -236,55 +239,55 @@ class Image(_Feature): fill: FillTypeJIT = None, ) -> Image: output = self._F._geometry.elastic_image_tensor(self, displacement, interpolation=interpolation, fill=fill) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_brightness(self, brightness_factor: float) -> Image: output = self._F.adjust_brightness_image_tensor(self, brightness_factor=brightness_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_saturation(self, saturation_factor: float) -> Image: output = self._F.adjust_saturation_image_tensor(self, saturation_factor=saturation_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_contrast(self, contrast_factor: float) -> Image: output = self._F.adjust_contrast_image_tensor(self, contrast_factor=contrast_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_sharpness(self, sharpness_factor: float) -> Image: output = self._F.adjust_sharpness_image_tensor(self, sharpness_factor=sharpness_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_hue(self, hue_factor: float) -> Image: output = self._F.adjust_hue_image_tensor(self, hue_factor=hue_factor) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def adjust_gamma(self, gamma: float, gain: float = 1) -> Image: output = self._F.adjust_gamma_image_tensor(self, gamma=gamma, gain=gain) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def posterize(self, bits: int) -> Image: output = self._F.posterize_image_tensor(self, bits=bits) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def solarize(self, threshold: float) -> Image: output = self._F.solarize_image_tensor(self, threshold=threshold) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def autocontrast(self) -> Image: output = self._F.autocontrast_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def equalize(self) -> Image: output = self._F.equalize_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def invert(self) -> Image: output = self._F.invert_image_tensor(self) - return Image.new_like(self, output) + return Image.wrap_like(self, output) def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Image: output = self._F.gaussian_blur_image_tensor(self, kernel_size=kernel_size, sigma=sigma) - return Image.new_like(self, output) + return Image.wrap_like(self, output) ImageType = Union[torch.Tensor, PIL.Image.Image, Image] diff --git a/torchvision/prototype/features/_label.py 
b/torchvision/prototype/features/_label.py index ebaa84d66..9c2bcfc0f 100644 --- a/torchvision/prototype/features/_label.py +++ b/torchvision/prototype/features/_label.py @@ -14,6 +14,12 @@ L = TypeVar("L", bound="_LabelBase") class _LabelBase(_Feature): categories: Optional[Sequence[str]] + @classmethod + def _wrap(cls: Type[L], tensor: torch.Tensor, *, categories: Optional[Sequence[str]]) -> L: + label_base = tensor.as_subclass(cls) + label_base.categories = categories + return label_base + def __new__( cls: Type[L], data: Any, @@ -23,16 +29,14 @@ class _LabelBase(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> L: - label_base = super().__new__(cls, data, dtype=dtype, device=device, requires_grad=requires_grad) - - label_base.categories = categories - - return label_base + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + return cls._wrap(tensor, categories=categories) @classmethod - def new_like(cls: Type[L], other: L, data: Any, *, categories: Optional[Sequence[str]] = None, **kwargs: Any) -> L: - return super().new_like( - other, data, categories=categories if categories is not None else other.categories, **kwargs + def wrap_like(cls: Type[L], other: L, tensor: torch.Tensor, *, categories: Optional[Sequence[str]] = None) -> L: + return cls._wrap( + tensor, + categories=categories if categories is not None else other.categories, ) @classmethod diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 9dd614752..65793dc45 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional, Union +from typing import Any, List, Optional, Union import torch from torchvision.transforms import InterpolationMode @@ -9,13 +9,36 @@ from ._feature import _Feature, FillTypeJIT class Mask(_Feature): + @classmethod + def _wrap(cls, tensor: torch.Tensor) -> Mask: + return tensor.as_subclass(cls) + + def __new__( + cls, + data: Any, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str, int]] = None, + requires_grad: bool = False, + ) -> Mask: + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) + return cls._wrap(tensor) + + @classmethod + def wrap_like( + cls, + other: Mask, + tensor: torch.Tensor, + ) -> Mask: + return cls._wrap(tensor) + def horizontal_flip(self) -> Mask: output = self._F.horizontal_flip_mask(self) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def vertical_flip(self) -> Mask: output = self._F.vertical_flip_mask(self) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -25,15 +48,15 @@ class Mask(_Feature): antialias: bool = False, ) -> Mask: output = self._F.resize_mask(self, size, max_size=max_size) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def crop(self, top: int, left: int, height: int, width: int) -> Mask: output = self._F.crop_mask(self, top, left, height, width) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def center_crop(self, output_size: List[int]) -> Mask: output = self._F.center_crop_mask(self, output_size=output_size) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def resized_crop( self, @@ -46,7 +69,7 @@ class Mask(_Feature): antialias: bool = False, ) -> Mask: 
output = self._F.resized_crop_mask(self, top, left, height, width, size=size) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def pad( self, @@ -55,7 +78,7 @@ class Mask(_Feature): padding_mode: str = "constant", ) -> Mask: output = self._F.pad_mask(self, padding, padding_mode=padding_mode, fill=fill) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def rotate( self, @@ -66,7 +89,7 @@ class Mask(_Feature): center: Optional[List[float]] = None, ) -> Mask: output = self._F.rotate_mask(self, angle, expand=expand, center=center, fill=fill) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def affine( self, @@ -87,7 +110,7 @@ class Mask(_Feature): fill=fill, center=center, ) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def perspective( self, @@ -96,7 +119,7 @@ class Mask(_Feature): fill: FillTypeJIT = None, ) -> Mask: output = self._F.perspective_mask(self, perspective_coeffs, fill=fill) - return Mask.new_like(self, output) + return Mask.wrap_like(self, output) def elastic( self, @@ -105,4 +128,4 @@ class Mask(_Feature): fill: FillTypeJIT = None, ) -> Mask: output = self._F.elastic_mask(self, displacement, fill=fill) - return Mask.new_like(self, output, dtype=output.dtype) + return Mask.wrap_like(self, output) diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index e19b6f7ed..a58027243 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -13,6 +13,12 @@ from ._image import ColorSpace, ImageType, ImageTypeJIT, TensorImageType, Tensor class Video(_Feature): color_space: ColorSpace + @classmethod + def _wrap(cls, tensor: torch.Tensor, *, color_space: ColorSpace) -> Video: + image = tensor.as_subclass(cls) + image.color_space = color_space + return image + def __new__( cls, data: Any, @@ -22,7 +28,7 @@ class Video(_Feature): device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, ) -> Video: - data = torch.as_tensor(data, dtype=dtype, device=device) + tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if data.ndim < 4: raise ValueError video = super().__new__(cls, data, requires_grad=requires_grad) @@ -35,21 +41,19 @@ class Video(_Feature): color_space = ColorSpace.from_str(color_space.upper()) elif not isinstance(color_space, ColorSpace): raise ValueError - video.color_space = color_space - - return video - def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(color_space=self.color_space) + return cls._wrap(tensor, color_space=color_space) @classmethod - def new_like( - cls, other: Video, data: Any, *, color_space: Optional[Union[ColorSpace, str]] = None, **kwargs: Any - ) -> Video: - return super().new_like( - other, data, color_space=color_space if color_space is not None else other.color_space, **kwargs + def wrap_like(cls, other: Video, tensor: torch.Tensor, *, color_space: Optional[ColorSpace] = None) -> Video: + return cls._wrap( + tensor, + color_space=color_space if color_space is not None else other.color_space, ) + def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] + return self._make_repr(color_space=self.color_space) + # TODO: rename this (and all instances of this term to spatial size) @property def image_size(self) -> Tuple[int, int]: @@ -67,7 +71,7 @@ class Video(_Feature): if isinstance(color_space, str): color_space = 
ColorSpace.from_str(color_space.upper()) - return Video.new_like( + return Video.wrap_like( self, self._F.convert_color_space_video( self, old_color_space=self.color_space, new_color_space=color_space, copy=copy @@ -77,11 +81,11 @@ class Video(_Feature): def horizontal_flip(self) -> Video: output = self._F.horizontal_flip_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def vertical_flip(self) -> Video: output = self._F.vertical_flip_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def resize( # type: ignore[override] self, @@ -91,15 +95,15 @@ class Video(_Feature): antialias: bool = False, ) -> Video: output = self._F.resize_video(self, size, interpolation=interpolation, max_size=max_size, antialias=antialias) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def crop(self, top: int, left: int, height: int, width: int) -> Video: output = self._F.crop_video(self, top, left, height, width) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def center_crop(self, output_size: List[int]) -> Video: output = self._F.center_crop_video(self, output_size=output_size) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def resized_crop( self, @@ -114,7 +118,7 @@ class Video(_Feature): output = self._F.resized_crop_video( self, top, left, height, width, size=list(size), interpolation=interpolation, antialias=antialias ) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def pad( self, @@ -123,7 +127,7 @@ class Video(_Feature): padding_mode: str = "constant", ) -> Video: output = self._F.pad_video(self, padding, fill=fill, padding_mode=padding_mode) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def rotate( self, @@ -136,7 +140,7 @@ class Video(_Feature): output = self._F._geometry.rotate_video( self, angle, interpolation=interpolation, expand=expand, fill=fill, center=center ) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def affine( self, @@ -158,7 +162,7 @@ class Video(_Feature): fill=fill, center=center, ) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def perspective( self, @@ -167,7 +171,7 @@ class Video(_Feature): fill: FillTypeJIT = None, ) -> Video: output = self._F._geometry.perspective_video(self, perspective_coeffs, interpolation=interpolation, fill=fill) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def elastic( self, @@ -176,55 +180,55 @@ class Video(_Feature): fill: FillTypeJIT = None, ) -> Video: output = self._F._geometry.elastic_video(self, displacement, interpolation=interpolation, fill=fill) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_brightness(self, brightness_factor: float) -> Video: output = self._F.adjust_brightness_video(self, brightness_factor=brightness_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_saturation(self, saturation_factor: float) -> Video: output = self._F.adjust_saturation_video(self, saturation_factor=saturation_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_contrast(self, contrast_factor: float) -> Video: output = self._F.adjust_contrast_video(self, contrast_factor=contrast_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_sharpness(self, sharpness_factor: float) -> Video: 
output = self._F.adjust_sharpness_video(self, sharpness_factor=sharpness_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_hue(self, hue_factor: float) -> Video: output = self._F.adjust_hue_video(self, hue_factor=hue_factor) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def adjust_gamma(self, gamma: float, gain: float = 1) -> Video: output = self._F.adjust_gamma_video(self, gamma=gamma, gain=gain) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def posterize(self, bits: int) -> Video: output = self._F.posterize_video(self, bits=bits) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def solarize(self, threshold: float) -> Video: output = self._F.solarize_video(self, threshold=threshold) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def autocontrast(self) -> Video: output = self._F.autocontrast_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def equalize(self) -> Video: output = self._F.equalize_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def invert(self) -> Video: output = self._F.invert_video(self) - return Video.new_like(self, output) + return Video.wrap_like(self, output) def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Video: output = self._F.gaussian_blur_video(self, kernel_size=kernel_size, sigma=sigma) - return Video.new_like(self, output) + return Video.wrap_like(self, output) VideoType = Union[torch.Tensor, Video] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 311ad6d5a..bcab0a3f4 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -119,7 +119,7 @@ class _BaseMixupCutmix(_RandomApplyTransform): raise ValueError("Need a batch of one hot labels") output = inpt.clone() output = output.roll(1, -2).mul_(1 - lam).add_(output.mul_(lam)) - return features.OneHotLabel.new_like(inpt, output) + return features.OneHotLabel.wrap_like(inpt, output) class RandomMixup(_BaseMixupCutmix): @@ -135,7 +135,7 @@ class RandomMixup(_BaseMixupCutmix): output = output.roll(1, -4).mul_(1 - lam).add_(output.mul_(lam)) if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output) + output = features.Image.wrap_like(inpt, output) return output elif isinstance(inpt, features.OneHotLabel): @@ -178,7 +178,7 @@ class RandomCutmix(_BaseMixupCutmix): output[..., y1:y2, x1:x2] = image_rolled[..., y1:y2, x1:x2] if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output) + output = features.Image.wrap_like(inpt, output) return output elif isinstance(inpt, features.OneHotLabel): @@ -213,9 +213,11 @@ class SimpleCopyPaste(_RandomApplyTransform): antialias: Optional[bool], ) -> Tuple[features.TensorImageType, Dict[str, Any]]: - paste_masks = paste_target["masks"].new_like(paste_target["masks"], paste_target["masks"][random_selection]) - paste_boxes = paste_target["boxes"].new_like(paste_target["boxes"], paste_target["boxes"][random_selection]) - paste_labels = paste_target["labels"].new_like(paste_target["labels"], paste_target["labels"][random_selection]) + paste_masks = paste_target["masks"].wrap_like(paste_target["masks"], paste_target["masks"][random_selection]) + paste_boxes = paste_target["boxes"].wrap_like(paste_target["boxes"], 
paste_target["boxes"][random_selection]) + paste_labels = paste_target["labels"].wrap_like( + paste_target["labels"], paste_target["labels"][random_selection] + ) masks = target["masks"] @@ -317,7 +319,7 @@ class SimpleCopyPaste(_RandomApplyTransform): c0, c1, c2, c3 = 0, 0, 0, 0 for i, obj in enumerate(flat_sample): if isinstance(obj, features.Image): - flat_sample[i] = features.Image.new_like(obj, output_images[c0]) + flat_sample[i] = features.Image.wrap_like(obj, output_images[c0]) c0 += 1 elif isinstance(obj, PIL.Image.Image): flat_sample[i] = F.to_image_pil(output_images[c0]) @@ -326,13 +328,13 @@ class SimpleCopyPaste(_RandomApplyTransform): flat_sample[i] = output_images[c0] c0 += 1 elif isinstance(obj, features.BoundingBox): - flat_sample[i] = features.BoundingBox.new_like(obj, output_targets[c1]["boxes"]) + flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) c1 += 1 elif isinstance(obj, features.Mask): - flat_sample[i] = features.Mask.new_like(obj, output_targets[c2]["masks"]) + flat_sample[i] = features.Mask.wrap_like(obj, output_targets[c2]["masks"]) c2 += 1 elif isinstance(obj, (features.Label, features.OneHotLabel)): - flat_sample[i] = obj.new_like(obj, output_targets[c3]["labels"]) # type: ignore[arg-type] + flat_sample[i] = obj.wrap_like(obj, output_targets[c3]["labels"]) # type: ignore[arg-type] c3 += 1 def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 4732f88d4..7e28d9d6c 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -520,7 +520,7 @@ class AugMix(_AutoAugmentBase): mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) if isinstance(orig_image_or_video, (features.Image, features.Video)): - mix = type(orig_image_or_video).new_like(orig_image_or_video, mix) # type: ignore[arg-type] + mix = type(orig_image_or_video).wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 451b57b66..67a6cc3cc 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -119,7 +119,8 @@ class RandomPhotometricDistort(Transform): output = inpt[..., permutation, :, :] if isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).new_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] + output = type(inpt).wrap_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] + elif isinstance(inpt, PIL.Image.Image): output = F.to_image_pil(output) diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index a9341415c..3979b178f 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -55,7 +55,7 @@ class Grayscale(Transform): def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: output = _F.rgb_to_grayscale(inpt, num_output_channels=self.num_output_channels) if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output, color_space=features.ColorSpace.GRAY) + output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) return output @@ -84,5 +84,5 @@ class 
RandomGrayscale(_RandomApplyTransform): def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: output = _F.rgb_to_grayscale(inpt, num_output_channels=params["num_input_channels"]) if isinstance(inpt, features.Image): - output = features.Image.new_like(inpt, output, color_space=features.ColorSpace.GRAY) + output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) return output diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 1f132ec92..37e2aee02 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -158,8 +158,8 @@ class FiveCrop(Transform): ... def forward(self, sample: Tuple[Tuple[features.Image, ...], features.Label]): ... images, labels = sample ... batch_size = len(images) - ... images = features.Image.new_like(images[0], torch.stack(images)) - ... labels = features.Label.new_like(labels, labels.repeat(batch_size)) + ... images = features.Image.wrap_like(images[0], torch.stack(images)) + ... labels = features.Label.wrap_like(labels, labels.repeat(batch_size)) ... return images, labels ... >>> image = features.Image(torch.rand(3, 256, 256)) @@ -677,18 +677,18 @@ class RandomIoUCrop(Transform): is_within_crop_area = params["is_within_crop_area"] if isinstance(inpt, (features.Label, features.OneHotLabel)): - return inpt.new_like(inpt, inpt[is_within_crop_area]) # type: ignore[arg-type] + return inpt.wrap_like(inpt, inpt[is_within_crop_area]) # type: ignore[arg-type] output = F.crop(inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]) if isinstance(output, features.BoundingBox): bboxes = output[is_within_crop_area] bboxes = F.clamp_bounding_box(bboxes, output.format, output.image_size) - output = features.BoundingBox.new_like(output, bboxes) + output = features.BoundingBox.wrap_like(output, bboxes) elif isinstance(output, features.Mask): # apply is_within_crop_area if mask is one-hot encoded masks = output[is_within_crop_area] - output = features.Mask.new_like(output, masks) + output = features.Mask.wrap_like(output, masks) return output @@ -801,7 +801,7 @@ class FixedSizeCrop(Transform): bounding_boxes = cast( features.BoundingBox, F.crop(bounding_boxes, top=top, left=left, height=new_height, width=new_width) ) - bounding_boxes = features.BoundingBox.new_like( + bounding_boxes = features.BoundingBox.wrap_like( bounding_boxes, F.clamp_bounding_box( bounding_boxes, format=bounding_boxes.format, image_size=bounding_boxes.image_size @@ -840,9 +840,9 @@ class FixedSizeCrop(Transform): if params["is_valid"] is not None: if isinstance(inpt, (features.Label, features.OneHotLabel, features.Mask)): - inpt = inpt.new_like(inpt, inpt[params["is_valid"]]) # type: ignore[arg-type] + inpt = inpt.wrap_like(inpt, inpt[params["is_valid"]]) # type: ignore[arg-type] elif isinstance(inpt, features.BoundingBox): - inpt = features.BoundingBox.new_like( + inpt = features.BoundingBox.wrap_like( inpt, F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, image_size=inpt.image_size), ) diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index cb090492a..74fbcd60f 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -18,7 +18,7 @@ class ConvertBoundingBoxFormat(Transform): def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: output = 
F.convert_format_bounding_box(inpt, old_format=inpt.format, new_format=params["format"]) - return features.BoundingBox.new_like(inpt, output, format=params["format"]) + return features.BoundingBox.wrap_like(inpt, output, format=params["format"]) class ConvertImageDtype(Transform): @@ -30,7 +30,11 @@ class ConvertImageDtype(Transform): def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> features.TensorImageType: output = F.convert_image_dtype(inpt, dtype=self.dtype) - return output if features.is_simple_tensor(inpt) else features.Image.new_like(inpt, output, dtype=self.dtype) # type: ignore[arg-type] + return ( + output + if features.is_simple_tensor(inpt) + else features.Image.wrap_like(inpt, output) # type: ignore[arg-type] + ) class ConvertColorSpace(Transform): @@ -65,4 +69,4 @@ class ClampBoundingBoxes(Transform): def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: output = F.clamp_bounding_box(inpt, format=inpt.format, image_size=inpt.image_size) - return features.BoundingBox.new_like(inpt, output) + return features.BoundingBox.wrap_like(inpt, output) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 2531bf8f6..dd1e1cdf8 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -171,4 +171,4 @@ class RemoveSmallBoundingBoxes(Transform): return dict(valid_indices=valid_indices) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - return inpt.new_like(inpt, inpt[params["valid_indices"]]) + return inpt.wrap_like(inpt, inpt[params["valid_indices"]]) diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 976feb99e..847343dbf 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -35,7 +35,7 @@ def erase( if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).new_like(inpt, output) # type: ignore[arg-type] + output = type(inpt).wrap_like(inpt, output) # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return erase_image_pil(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index f205b5aea..c63fe5b41 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1409,7 +1409,7 @@ def five_crop( if isinstance(inpt, torch.Tensor): output = five_crop_image_tensor(inpt, size) if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = tuple(features.Image.new_like(inpt, item) for item in output) # type: ignore[assignment] + output = tuple(features.Image.wrap_like(inpt, item) for item in output) # type: ignore[assignment] return output else: # isinstance(inpt, PIL.Image.Image): return five_crop_image_pil(inpt, size) @@ -1446,7 +1446,7 @@ def ten_crop(inpt: features.ImageTypeJIT, size: List[int], vertical_flip: bool = if isinstance(inpt, torch.Tensor): output = ten_crop_image_tensor(inpt, size, vertical_flip=vertical_flip) if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = [features.Image.new_like(inpt, item) for 
item in output] + output = [features.Image.wrap_like(inpt, item) for item in output] return output else: # isinstance(inpt, PIL.Image.Image): return ten_crop_image_pil(inpt, size, vertical_flip=vertical_flip) -- GitLab From 6e203b44098c3371689f56abc17b7c02bd51a261 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 7 Oct 2022 17:26:50 +0100 Subject: [PATCH 022/624] [prototype] Rewrite the meta dimension methods (#6722) * Rewrite `get_dimensions`, `get_num_channels` and `get_spatial_size` * Remove `get_chw` * Remove comments * Make `get_spatial_size` support non-image input * Reduce the unnecessary use of `get_dimensions*` * Fix linters * Fix merge bug * Linter * Fix linter --- torchvision/prototype/features/_mask.py | 6 +- .../prototype/transforms/_auto_augment.py | 10 ++-- torchvision/prototype/transforms/_utils.py | 7 ++- .../transforms/functional/__init__.py | 6 ++ .../transforms/functional/_geometry.py | 21 ++++--- .../prototype/transforms/functional/_meta.py | 58 ++++++++++++------- 6 files changed, 71 insertions(+), 37 deletions(-) diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 65793dc45..7b49ce8e8 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Union +from typing import Any, cast, List, Optional, Tuple, Union import torch from torchvision.transforms import InterpolationMode @@ -32,6 +32,10 @@ class Mask(_Feature): ) -> Mask: return cls._wrap(tensor) + @property + def image_size(self) -> Tuple[int, int]: + return cast(Tuple[int, int], tuple(self.shape[-2:])) + def horizontal_flip(self) -> Mask: output = self._F.horizontal_flip_mask(self) return Mask.wrap_like(self, output) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 7e28d9d6c..6ef9edba3 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -7,7 +7,7 @@ import torch from torch.utils._pytree import tree_flatten, tree_unflatten from torchvision.prototype import features from torchvision.prototype.transforms import AutoAugmentPolicy, functional as F, InterpolationMode, Transform -from torchvision.prototype.transforms.functional._meta import get_chw +from torchvision.prototype.transforms.functional._meta import get_spatial_size from ._utils import _isinstance, _setup_fill_arg @@ -278,7 +278,7 @@ class AutoAugment(_AutoAugmentBase): sample = inputs if len(inputs) > 1 else inputs[0] id, image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(image_or_video) + height, width = get_spatial_size(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -349,7 +349,7 @@ class RandAugment(_AutoAugmentBase): sample = inputs if len(inputs) > 1 else inputs[0] id, image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(image_or_video) + height, width = get_spatial_size(image_or_video) for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -403,7 +403,7 @@ class TrivialAugmentWide(_AutoAugmentBase): sample = inputs if len(inputs) > 1 else inputs[0] id, image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(image_or_video) + height, width = get_spatial_size(image_or_video) transform_id, (magnitudes_fn, signed) = 
self._get_random_item(self._AUGMENTATION_SPACE) @@ -473,7 +473,7 @@ class AugMix(_AutoAugmentBase): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] id, orig_image_or_video = self._extract_image_or_video(sample) - _, height, width = get_chw(orig_image_or_video) + height, width = get_spatial_size(orig_image_or_video) if isinstance(orig_image_or_video, torch.Tensor): image_or_video = orig_image_or_video diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index a76891a34..db1ff4b7b 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -10,7 +10,7 @@ from torchvision._utils import sequence_to_str from torchvision.prototype import features from torchvision.prototype.features._feature import FillType -from torchvision.prototype.transforms.functional._meta import get_chw +from torchvision.prototype.transforms.functional._meta import get_dimensions from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size # noqa: F401 from typing_extensions import Literal @@ -80,7 +80,7 @@ def query_bounding_box(sample: Any) -> features.BoundingBox: def query_chw(sample: Any) -> Tuple[int, int, int]: flat_sample, _ = tree_flatten(sample) chws = { - get_chw(item) + tuple(get_dimensions(item)) for item in flat_sample if isinstance(item, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(item) } @@ -88,7 +88,8 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: raise TypeError("No image or video was found in the sample") elif len(chws) > 1: raise ValueError(f"Found multiple CxHxW dimensions in the sample: {sequence_to_str(sorted(chws))}") - return chws.pop() + c, h, w = chws.pop() + return c, h, w def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...]) -> bool: diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index cb801df73..1e918cc34 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -8,9 +8,15 @@ from ._meta import ( convert_color_space_image_pil, convert_color_space_video, convert_color_space, + get_dimensions_image_tensor, + get_dimensions_image_pil, get_dimensions, get_image_num_channels, + get_num_channels_image_tensor, + get_num_channels_image_pil, get_num_channels, + get_spatial_size_image_tensor, + get_spatial_size_image_pil, get_spatial_size, ) # usort: skip diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index c63fe5b41..670b2cb87 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -21,7 +21,12 @@ from torchvision.transforms.functional_tensor import ( interpolate, ) -from ._meta import convert_format_bounding_box, get_dimensions_image_pil, get_dimensions_image_tensor +from ._meta import ( + convert_format_bounding_box, + get_dimensions_image_tensor, + get_spatial_size_image_pil, + get_spatial_size_image_tensor, +) horizontal_flip_image_tensor = _FT.hflip horizontal_flip_image_pil = _FP.hflip @@ -323,7 +328,7 @@ def affine_image_pil( # it is visually better to estimate the center without 0.5 offset # otherwise image rotated by 90 degrees is shifted vs output image of torch.rot90 or F_t.affine if center is None: - _, height, width = 
get_dimensions_image_pil(image) + height, width = get_spatial_size_image_pil(image) center = [width * 0.5, height * 0.5] matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear) @@ -1189,13 +1194,13 @@ def _center_crop_compute_crop_anchor( def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> torch.Tensor: crop_height, crop_width = _center_crop_parse_output_size(output_size) - _, image_height, image_width = get_dimensions_image_tensor(image) + image_height, image_width = get_spatial_size_image_tensor(image) if crop_height > image_height or crop_width > image_width: padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) image = pad_image_tensor(image, padding_ltrb, fill=0) - _, image_height, image_width = get_dimensions_image_tensor(image) + image_height, image_width = get_spatial_size_image_tensor(image) if crop_width == image_width and crop_height == image_height: return image @@ -1206,13 +1211,13 @@ def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> tor @torch.jit.unused def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL.Image.Image: crop_height, crop_width = _center_crop_parse_output_size(output_size) - _, image_height, image_width = get_dimensions_image_pil(image) + image_height, image_width = get_spatial_size_image_pil(image) if crop_height > image_height or crop_width > image_width: padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) image = pad_image_pil(image, padding_ltrb, fill=0) - _, image_height, image_width = get_dimensions_image_pil(image) + image_height, image_width = get_spatial_size_image_pil(image) if crop_width == image_width and crop_height == image_height: return image @@ -1365,7 +1370,7 @@ def five_crop_image_tensor( image: torch.Tensor, size: List[int] ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: crop_height, crop_width = _parse_five_crop_size(size) - _, image_height, image_width = get_dimensions_image_tensor(image) + image_height, image_width = get_spatial_size_image_tensor(image) if crop_width > image_width or crop_height > image_height: msg = "Requested crop size {} is bigger than input size {}" @@ -1385,7 +1390,7 @@ def five_crop_image_pil( image: PIL.Image.Image, size: List[int] ) -> Tuple[PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image]: crop_height, crop_width = _parse_five_crop_size(size) - _, image_height, image_width = get_dimensions_image_pil(image) + image_height, image_width = get_spatial_size_image_pil(image) if crop_width > image_width or crop_height > image_height: msg = "Requested crop size {} is bigger than input size {}" diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index 1e53edf39..e24b68c9f 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -6,38 +6,37 @@ from torchvision.prototype import features from torchvision.prototype.features import BoundingBoxFormat, ColorSpace from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT + get_dimensions_image_tensor = _FT.get_dimensions get_dimensions_image_pil = _FP.get_dimensions -# TODO: Should this be prefixed with `_` similar to other methods that don't get exposed by init? 
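The hunks above switch the geometry kernels from get_dimensions_image_* to the new get_spatial_size_* helpers, which return only the spatial extent [height, width]; get_dimensions keeps returning [channels, height, width]. A minimal sketch of that contract, assuming the prototype functional namespace re-exports these names as in the __init__.py hunk above (this snapshot of the prototype API, not a stable interface):

    import torch
    from torchvision.prototype.transforms import functional as F

    image = torch.rand(3, 32, 48)   # plain CHW image tensor
    F.get_dimensions(image)         # -> [3, 32, 48]  (channels, height, width)
    F.get_spatial_size(image)       # -> [32, 48]     (height, width only)

For Image/Video features and PIL images, the get_spatial_size dispatcher added later in this patch resolves the size from the feature metadata or the PIL kernel instead of the raw tensor shape.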
-def get_chw(image: features.ImageOrVideoTypeJIT) -> Tuple[int, int, int]: +def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) ): - channels, height, width = get_dimensions_image_tensor(image) + return get_dimensions_image_tensor(image) elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels height, width = image.image_size - else: # isinstance(image, PIL.Image.Image) - channels, height, width = get_dimensions_image_pil(image) - return channels, height, width - - -# The three functions below are here for BC. Whether we want to have two different kernels and how they and the -# compound version should be named is still under discussion: https://github.com/pytorch/vision/issues/6491 -# Given that these kernels should also support boxes, masks, and videos, it is unlikely that there name will stay. -# They will either be deprecated or simply aliased to the new kernels if we have reached consensus about the issue -# detailed above. + return [channels, height, width] + else: + return get_dimensions_image_pil(image) -def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: - return list(get_chw(image)) +get_num_channels_image_tensor = _FT.get_image_num_channels +get_num_channels_image_pil = _FP.get_image_num_channels def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: - num_channels, *_ = get_chw(image) - return num_channels + if isinstance(image, torch.Tensor) and ( + torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) + ): + return _FT.get_image_num_channels(image) + elif isinstance(image, (features.Image, features.Video)): + return image.num_channels + else: + return _FP.get_image_num_channels(image) # We changed the names to ensure it can be used not only for images but also videos. 
Thus, we just alias it without @@ -45,9 +44,28 @@ def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: get_image_num_channels = get_num_channels -def get_spatial_size(image: features.ImageOrVideoTypeJIT) -> List[int]: - _, *size = get_chw(image) - return size +def get_spatial_size_image_tensor(image: torch.Tensor) -> List[int]: + width, height = _FT.get_image_size(image) + return [height, width] + + +@torch.jit.unused +def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: + width, height = _FP.get_image_size(image) + return [height, width] + + +def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: + if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): + return get_spatial_size_image_tensor(inpt) + elif isinstance(inpt, features._Feature): + image_size = getattr(inpt, "image_size", None) + if image_size is not None: + return list(image_size) + else: + raise ValueError(f"Type {inpt.__class__} doesn't have spatial size.") + else: + return get_spatial_size_image_pil(inpt) def _xywh_to_xyxy(xywh: torch.Tensor) -> torch.Tensor: -- GitLab From af54e5645399bc67711155de2f8bb9cb1f4ebbe1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 10 Oct 2022 09:46:09 +0100 Subject: [PATCH 023/624] [FBcode->GH] Fix GRACE_HOPPER file internal discovery (#6719) Co-authored-by: Vasilis Vryniotis --- test/test_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index d284ec6fe..a169f5053 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -44,9 +44,11 @@ def _get_image(input_shape, real_image, device): To do so, a keyword argument `real_image` was added to the abovelisted models in `_model_params` """ if real_image: - GRACE_HOPPER = get_relative_path( - os.path.dirname(os.path.realpath(__file__)), "test", "assets", "encode_jpeg", "grace_hopper_517x606.jpg" + # TODO: Maybe unify file discovery logic with test_image.py + GRACE_HOPPER = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" ) + img = Image.open(GRACE_HOPPER) original_width, original_height = img.size -- GitLab From 019139f7875c3388aa6c3cd5d65782b69b3059bf Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 10 Oct 2022 11:16:42 +0200 Subject: [PATCH 024/624] make _setup_fill_arg serializable (#6730) --- torchvision/prototype/transforms/_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index db1ff4b7b..a3980fa21 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -1,6 +1,6 @@ +import functools import numbers from collections import defaultdict - from typing import Any, Callable, Dict, Sequence, Tuple, Type, Union import PIL.Image @@ -43,13 +43,19 @@ def _check_fill_arg(fill: Union[FillType, Dict[Type, FillType]]) -> None: raise TypeError("Got inappropriate fill arg") +def _default_fill(fill: FillType) -> FillType: + return fill + + def _setup_fill_arg(fill: Union[FillType, Dict[Type, FillType]]) -> Dict[Type, FillType]: _check_fill_arg(fill) if isinstance(fill, dict): return fill - return defaultdict(lambda: fill) # type: ignore[return-value, arg-type] + # This weird looking construct only exists, since `lambda`'s cannot be serialized by pickle. 
+ # If it were possible, we could replace this with `defaultdict(lambda: fill)` + return defaultdict(functools.partial(_default_fill, fill)) def _check_padding_arg(padding: Union[int, Sequence[int]]) -> None: -- GitLab From 17969ebad94eecf8c59db531d53a205ec8993467 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 10 Oct 2022 11:17:22 +0200 Subject: [PATCH 025/624] enable arbitrary batch size for all prototype kernels (#6726) * enable arbitrary batch size for all prototype kernels * put back perspective dispatcher --- test/prototype_transforms_dispatcher_infos.py | 9 --- test/prototype_transforms_kernel_infos.py | 11 --- .../transforms/functional/_geometry.py | 76 +++++++++---------- .../prototype/transforms/functional/_misc.py | 38 +++++----- 4 files changed, 54 insertions(+), 80 deletions(-) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index be8bd3002..de933c7e3 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -138,12 +138,6 @@ def xfail_all_tests(*, reason, condition): ] -xfails_degenerate_or_multi_batch_dims = xfail_all_tests( - reason="See https://github.com/pytorch/vision/issues/6670 for details.", - condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), -) - - DISPATCHER_INFOS = [ DispatcherInfo( F.horizontal_flip, @@ -260,7 +254,6 @@ DISPATCHER_INFOS = [ pil_kernel_info=PILKernelInfo(F.perspective_image_pil), test_marks=[ xfail_dispatch_pil_if_fill_sequence_needs_broadcast, - *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( @@ -271,7 +264,6 @@ DISPATCHER_INFOS = [ features.Mask: F.elastic_mask, }, pil_kernel_info=PILKernelInfo(F.elastic_image_pil), - test_marks=xfails_degenerate_or_multi_batch_dims, ), DispatcherInfo( F.center_crop, @@ -294,7 +286,6 @@ DISPATCHER_INFOS = [ test_marks=[ xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), - *xfails_degenerate_or_multi_batch_dims, ], ), DispatcherInfo( diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index d90d3bf68..9ebfc7a00 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -156,12 +156,6 @@ def xfail_all_tests(*, reason, condition): ] -xfails_image_degenerate_or_multi_batch_dims = xfail_all_tests( - reason="See https://github.com/pytorch/vision/issues/6670 for details.", - condition=lambda args_kwargs: len(args_kwargs.args[0].shape) > 4 or not all(args_kwargs.args[0].shape[:-3]), -) - - KERNEL_INFOS = [] @@ -1156,7 +1150,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.perspective_bounding_box, @@ -1168,7 +1161,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.perspective_image_pil), reference_inputs_fn=reference_inputs_perspective_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.perspective_video, @@ -1239,7 +1231,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_image_tensor, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( 
F.elastic_bounding_box, @@ -1251,7 +1242,6 @@ KERNEL_INFOS.extend( reference_fn=pil_reference_wrapper(F.elastic_image_pil), reference_inputs_fn=reference_inputs_elastic_mask, closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - test_marks=xfails_image_degenerate_or_multi_batch_dims, ), KernelInfo( F.elastic_video, @@ -1379,7 +1369,6 @@ KERNEL_INFOS.extend( test_marks=[ xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), - *xfails_image_degenerate_or_multi_batch_dims, ], ), KernelInfo( diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 670b2cb87..2c064245e 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -882,7 +882,23 @@ def perspective_image_tensor( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - return _FT.perspective(image, perspective_coeffs, interpolation=interpolation.value, fill=fill) + if image.numel() == 0: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT.perspective(image, perspective_coeffs, interpolation=interpolation.value, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output @torch.jit.unused @@ -1007,25 +1023,7 @@ def perspective_video( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when - # https://github.com/pytorch/vision/issues/6670 is resolved. - if video.numel() == 0: - return video - - shape = video.shape - - if video.ndim > 4: - video = video.view((-1,) + shape[-3:]) - needs_unsquash = True - else: - needs_unsquash = False - - output = perspective_image_tensor(video, perspective_coeffs, interpolation=interpolation, fill=fill) - - if needs_unsquash: - output = output.view(shape) - - return output + return perspective_image_tensor(video, perspective_coeffs, interpolation=interpolation, fill=fill) def perspective( @@ -1048,7 +1046,23 @@ def elastic_image_tensor( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - return _FT.elastic_transform(image, displacement, interpolation=interpolation.value, fill=fill) + if image.numel() == 0: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT.elastic_transform(image, displacement, interpolation=interpolation.value, fill=fill) + + if needs_unsquash: + output = output.view(shape) + + return output @torch.jit.unused @@ -1128,25 +1142,7 @@ def elastic_video( interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: features.FillTypeJIT = None, ) -> torch.Tensor: - # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when - # https://github.com/pytorch/vision/issues/6670 is resolved. 
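The edits above inline the flatten/restore workaround into perspective_image_tensor and elastic_image_tensor, so the image kernels themselves accept arbitrary leading batch dimensions (degenerate, video, or extra batch dims) and the *_video wrappers reduce to plain pass-throughs. A standalone sketch of that reshape idiom; the helper name is hypothetical and the flip call is a placeholder for the real 4-D kernel, since only the shape handling is being illustrated:

    import torch

    def _apply_on_flattened_batch(image: torch.Tensor) -> torch.Tensor:
        # Flatten any extra leading dims into a single batch dim,
        # run the 4-D kernel, then restore the original shape.
        if image.numel() == 0:
            return image
        shape = image.shape
        needs_unsquash = image.ndim > 4
        if needs_unsquash:
            image = image.view((-1,) + shape[-3:])
        output = image.flip(-1)  # placeholder for the actual 4-D kernel call
        if needs_unsquash:
            output = output.view(shape)
        return output

    video = torch.rand(2, 8, 3, 16, 16)  # (batch, frames, C, H, W)
    assert _apply_on_flattened_batch(video).shape == video.shape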
- if video.numel() == 0: - return video - - shape = video.shape - - if video.ndim > 4: - video = video.view((-1,) + shape[-3:]) - needs_unsquash = True - else: - needs_unsquash = False - - output = elastic_image_tensor(video, displacement, interpolation=interpolation, fill=fill) - - if needs_unsquash: - output = output.view(shape) - - return output + return elastic_image_tensor(video, displacement, interpolation=interpolation, fill=fill) def elastic( diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 7b3773e63..79a358b4e 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -56,7 +56,23 @@ def gaussian_blur_image_tensor( if s <= 0.0: raise ValueError(f"sigma should have positive values. Got {sigma}") - return _FT.gaussian_blur(image, kernel_size, sigma) + if image.numel() == 0: + return image + + shape = image.shape + + if image.ndim > 4: + image = image.view((-1,) + shape[-3:]) + needs_unsquash = True + else: + needs_unsquash = False + + output = _FT.gaussian_blur(image, kernel_size, sigma) + + if needs_unsquash: + output = output.view(shape) + + return output @torch.jit.unused @@ -71,25 +87,7 @@ def gaussian_blur_image_pil( def gaussian_blur_video( video: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> torch.Tensor: - # TODO: this is a temporary workaround until the image kernel supports arbitrary batch sizes. Remove this when - # https://github.com/pytorch/vision/issues/6670 is resolved. - if video.numel() == 0: - return video - - shape = video.shape - - if video.ndim > 4: - video = video.view((-1,) + shape[-3:]) - needs_unsquash = True - else: - needs_unsquash = False - - output = gaussian_blur_image_tensor(video, kernel_size, sigma) - - if needs_unsquash: - output = output.view(shape) - - return output + return gaussian_blur_image_tensor(video, kernel_size, sigma) def gaussian_blur( -- GitLab From 3f1d9f6b21464aa023327dd0d2b397648470c387 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 10 Oct 2022 13:07:23 +0200 Subject: [PATCH 026/624] Refactor `KernelInfo` and `DispatcherInfo` (#6710) * make args and kwargs in ArgsKwargs more accessible * refactor KernelInfo and DispatcherInfo * remove ArgsKwargs __getitem__ shortcut again --- test/prototype_common_utils.py | 53 +++++++++++ test/prototype_transforms_dispatcher_infos.py | 94 ++++++++++--------- test/prototype_transforms_kernel_infos.py | 92 +++++++----------- test/test_prototype_transforms_functional.py | 33 +++---- 4 files changed, 150 insertions(+), 122 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index c10cec94c..1d5766b1f 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -3,6 +3,7 @@ import collections.abc import dataclasses import functools +from collections import defaultdict from typing import Callable, Optional, Sequence, Tuple, Union import PIL.Image @@ -47,6 +48,9 @@ __all__ = [ "make_masks", "make_video", "make_videos", + "TestMark", + "mark_framework_limitation", + "InfoBase", ] @@ -588,3 +592,52 @@ def make_video_loaders( make_videos = from_loaders(make_video_loaders) + + +class TestMark: + def __init__( + self, + # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is + # no test class, i.e. a standalone test function, use `None`. + test_id, + # `pytest.mark.*` to apply, e.g. 
`pytest.mark.skip` or `pytest.mark.xfail` + mark, + *, + # Callable, that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be + # applied. If omitted, defaults to always apply. + condition=None, + ): + self.test_id = test_id + self.mark = mark + self.condition = condition or (lambda args_kwargs: True) + + +def mark_framework_limitation(test_id, reason): + # The purpose of this function is to have a single entry point for skip marks that are only there, because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we are wasting CI resources for no reason for most of the time + return TestMark(test_id, pytest.mark.skip(reason=reason)) + + +class InfoBase: + def __init__(self, *, id, test_marks=None, closeness_kwargs=None): + # Identifier if the info that shows up the parametrization. + self.id = id + # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. + # See the `TestMark` class for details + self.test_marks = test_marks or [] + # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. + self.closeness_kwargs = closeness_kwargs or dict() + + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py index de933c7e3..82173907c 100644 --- a/test/prototype_transforms_dispatcher_infos.py +++ b/test/prototype_transforms_dispatcher_infos.py @@ -1,57 +1,67 @@ import collections.abc -import dataclasses - -from collections import defaultdict - -from typing import Callable, Dict, List, Optional, Sequence, Type import pytest import torchvision.prototype.transforms.functional as F -from prototype_transforms_kernel_infos import KERNEL_INFOS, TestMark +from prototype_common_utils import InfoBase, TestMark +from prototype_transforms_kernel_infos import KERNEL_INFOS from torchvision.prototype import features __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] -KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} - - -@dataclasses.dataclass -class PILKernelInfo: - kernel: Callable - kernel_name: str = dataclasses.field(default=None) - - def __post_init__(self): - self.kernel_name = self.kernel_name or self.kernel.__name__ - -@dataclasses.dataclass -class DispatcherInfo: - dispatcher: Callable - kernels: Dict[Type, Callable] - pil_kernel_info: Optional[PILKernelInfo] = None - method_name: str = dataclasses.field(default=None) - test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) - _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) - - def __post_init__(self): - self.kernel_infos = {feature_type: KERNEL_INFO_MAP[kernel] for feature_type, kernel in self.kernels.items()} - self.method_name = self.method_name or self.dispatcher.__name__ - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map 
= dict(test_marks_map) - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] +class PILKernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + ): + super().__init__(id=kernel_name or kernel.__name__) + self.kernel = kernel + + +class DispatcherInfo(InfoBase): + _KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} + + def __init__( + self, + dispatcher, + *, + # Dictionary of types that map to the kernel the dispatcher dispatches to. + kernels, + # If omitted, no PIL dispatch test will be performed. + pil_kernel_info=None, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=dispatcher.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.dispatcher = dispatcher + self.kernels = kernels + self.pil_kernel_info = pil_kernel_info + + kernel_infos = {} + for feature_type, kernel in self.kernels.items(): + kernel_info = self._KERNEL_INFO_MAP.get(kernel) + if not kernel_info: + raise pytest.UsageError( + f"Can't register {kernel.__name__} for type {feature_type} since there is no `KernelInfo` for it. " + f"Please add a `KernelInfo` for it in `prototype_transforms_kernel_infos.py`." + ) + kernel_infos[feature_type] = kernel_info + self.kernel_infos = kernel_infos def sample_inputs(self, *feature_types, filter_metadata=True): - for feature_type in feature_types or self.kernels.keys(): - if feature_type not in self.kernels: - raise pytest.UsageError(f"There is no kernel registered for type {feature_type.__name__}") + for feature_type in feature_types or self.kernel_infos.keys(): + kernel_info = self.kernel_infos.get(feature_type) + if not kernel_info: + raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") + + sample_inputs = kernel_info.sample_inputs_fn() - sample_inputs = self.kernel_infos[feature_type].sample_inputs_fn() if not filter_metadata: yield from sample_inputs else: diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 9ebfc7a00..34f1f875a 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1,26 +1,24 @@ -import dataclasses import functools import itertools import math -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple import numpy as np import pytest import torch.testing import torchvision.ops import torchvision.prototype.transforms.functional as F - -from _pytest.mark.structures import MarkDecorator from common_utils import cycle_over from datasets_utils import combinations_grid from prototype_common_utils import ( ArgsKwargs, + InfoBase, make_bounding_box_loaders, make_image_loader, make_image_loaders, make_mask_loaders, make_video_loaders, + mark_framework_limitation, + TestMark, VALID_EXTRA_DIMS, ) from torchvision.prototype import features @@ -29,51 +27,35 @@ from torchvision.transforms.functional_tensor import _max_value as get_max_value __all__ = ["KernelInfo", "KERNEL_INFOS"] -TestID = Tuple[Optional[str], str] - - -@dataclasses.dataclass -class TestMark: - test_id: TestID - mark: MarkDecorator - condition: Callable[[ArgsKwargs], bool] = lambda args_kwargs: True - 
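The refactor above replaces the per-module dataclasses with a shared InfoBase: test marks are grouped by test id at construction time, and get_marks returns the pytest marks whose condition matches a concrete ArgsKwargs. A small usage sketch under the new API; the kernel, test id, and reason are illustrative only, and the imports assume the test helper modules are importable as they are when running the suite from the test/ directory:

    import pytest
    import torch
    import torchvision.prototype.transforms.functional as F
    from prototype_common_utils import ArgsKwargs, TestMark
    from prototype_transforms_kernel_infos import KernelInfo

    # Hypothetical info entry: xfail one test for every parametrization of this kernel.
    info = KernelInfo(
        F.horizontal_flip_image_tensor,
        sample_inputs_fn=lambda: [ArgsKwargs(torch.rand(3, 8, 8))],
        test_marks=[
            TestMark(
                ("TestKernels", "test_scripted_vs_eager"),
                pytest.mark.xfail(reason="illustrative only"),
                condition=lambda args_kwargs: True,
            )
        ],
    )

    args_kwargs = next(iter(info.sample_inputs_fn()))
    # Returns the xfail mark above, since its condition matches this ArgsKwargs.
    marks = info.get_marks(("TestKernels", "test_scripted_vs_eager"), args_kwargs)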
- -@dataclasses.dataclass -class KernelInfo: - kernel: Callable - # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but should - # not include extensive parameter combinations to keep to overall test count moderate. - sample_inputs_fn: Callable[[], Iterable[ArgsKwargs]] - # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name - # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name: str = dataclasses.field(default=None) - # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also take - # tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should happen - # inside the function. It should return a tensor or to be more precise an object that can be compared to a - # tensor by `assert_close`. If omitted, no reference test will be performed. - reference_fn: Optional[Callable] = None - # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter - # values to be tested. If not specified, `sample_inputs_fn` will be used. - reference_inputs_fn: Optional[Callable[[], Iterable[ArgsKwargs]]] = None - # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. - closeness_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) - test_marks: Sequence[TestMark] = dataclasses.field(default_factory=list) - _test_marks_map: Dict[str, List[TestMark]] = dataclasses.field(default=None, init=False) - - def __post_init__(self): - self.kernel_name = self.kernel_name or self.kernel.__name__ - self.reference_inputs_fn = self.reference_inputs_fn or self.sample_inputs_fn - - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map = dict(test_marks_map) - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] +class KernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but + # should not include extensive parameter combinations to keep to overall test count moderate. + sample_inputs_fn, + # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also + # take tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should + # happen inside the function. It should return a tensor or to be more precise an object that can be compared to + # a tensor by `assert_close`. If omitted, no reference test will be performed. + reference_fn=None, + # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter + # values to be tested. If not specified, `sample_inputs_fn` will be used. 
+ reference_inputs_fn=None, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.kernel = kernel + self.sample_inputs_fn = sample_inputs_fn + self.reference_fn = reference_fn + self.reference_inputs_fn = reference_inputs_fn DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -97,16 +79,6 @@ def pil_reference_wrapper(pil_kernel): return wrapper -def mark_framework_limitation(test_id, reason): - # The purpose of this function is to have a single entry point for skip marks that are only there, because the test - # framework cannot handle the kernel in general or a specific parameter combination. - # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is - # still justified. - # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, - # we are wasting CI resources for no reason for most of the time. - return TestMark(test_id, pytest.mark.skip(reason=reason)) - - def xfail_jit_python_scalar_arg(name, *, reason=None): reason = reason or f"Python scalar int or float for `{name}` is not supported when scripting" return TestMark( diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 5adea4d26..8329de697 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1,4 +1,3 @@ -import functools import math import os @@ -27,7 +26,7 @@ def script(fn): raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error -def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, name_fn=lambda info: str(info)): +def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None): if condition is None: def condition(info): @@ -41,7 +40,7 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n elif len(parts) == 2: test_class_name, test_function_name = parts else: - raise pytest.UsageError("Unable to parse the test class and test name from test function") + raise pytest.UsageError("Unable to parse the test class name and test function name from test function") test_id = (test_class_name, test_function_name) argnames = ("info", "args_kwargs") @@ -51,7 +50,6 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n continue args_kwargs = list(args_kwargs_fn(info)) - name = name_fn(info) idx_field_len = len(str(len(args_kwargs))) for idx, args_kwargs_ in enumerate(args_kwargs): @@ -60,7 +58,7 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n info, args_kwargs_, marks=info.get_marks(test_id, args_kwargs_), - id=f"{name}-{idx:0{idx_field_len}}", + id=f"{info.id}-{idx:0{idx_field_len}}", ) ) @@ -70,14 +68,11 @@ def make_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None, n class TestKernels: - make_kernel_args_kwargs_parametrization = functools.partial( - make_args_kwargs_parametrization, name_fn=lambda info: info.kernel_name - ) - sample_inputs = kernel_sample_inputs = make_kernel_args_kwargs_parametrization( + sample_inputs = make_info_args_kwargs_parametrization( KERNEL_INFOS, args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), ) - reference_inputs = make_kernel_args_kwargs_parametrization( + reference_inputs = make_info_args_kwargs_parametrization( 
KERNEL_INFOS, args_kwargs_fn=lambda info: info.reference_inputs_fn(), condition=lambda info: info.reference_fn is not None, @@ -208,10 +203,7 @@ def spy_on(mocker): class TestDispatchers: - make_dispatcher_args_kwargs_parametrization = functools.partial( - make_args_kwargs_parametrization, name_fn=lambda info: info.dispatcher.__name__ - ) - image_sample_inputs = kernel_sample_inputs = make_dispatcher_args_kwargs_parametrization( + image_sample_inputs = make_info_args_kwargs_parametrization( DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(features.Image), condition=lambda info: features.Image in info.kernels, @@ -251,13 +243,13 @@ class TestDispatchers: image_simple_tensor = torch.Tensor(image_feature) kernel_info = info.kernel_infos[features.Image] - spy = spy_on(kernel_info.kernel, module=info.dispatcher.__module__, name=kernel_info.kernel_name) + spy = spy_on(kernel_info.kernel, module=info.dispatcher.__module__, name=kernel_info.id) info.dispatcher(image_simple_tensor, *other_args, **kwargs) spy.assert_called_once() - @make_dispatcher_args_kwargs_parametrization( + @make_info_args_kwargs_parametrization( DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(features.Image), condition=lambda info: info.pil_kernel_info is not None, @@ -271,22 +263,23 @@ class TestDispatchers: image_pil = F.to_image_pil(image_feature) pil_kernel_info = info.pil_kernel_info - spy = spy_on(pil_kernel_info.kernel, module=info.dispatcher.__module__, name=pil_kernel_info.kernel_name) + spy = spy_on(pil_kernel_info.kernel, module=info.dispatcher.__module__, name=pil_kernel_info.id) info.dispatcher(image_pil, *other_args, **kwargs) spy.assert_called_once() - @make_dispatcher_args_kwargs_parametrization( + @make_info_args_kwargs_parametrization( DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(), ) def test_dispatch_feature(self, info, args_kwargs, spy_on): (feature, *other_args), kwargs = args_kwargs.load() - method = getattr(feature, info.method_name) + method_name = info.id + method = getattr(feature, method_name) feature_type = type(feature) - spy = spy_on(method, module=feature_type.__module__, name=f"{feature_type.__name__}.{info.method_name}") + spy = spy_on(method, module=feature_type.__module__, name=f"{feature_type.__name__}.{method_name}") info.dispatcher(feature, *other_args, **kwargs) -- GitLab From 0ab50f5fabbb976a70c815d49fec4a56b8f46359 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 10 Oct 2022 12:32:31 +0100 Subject: [PATCH 027/624] Remove performance workaround for mask resize (#6729) * Remove performance workaround for mask resize * Fix linter * bug fixes * remove unnecessary import * Fixing linter --- test/prototype_transforms_kernel_infos.py | 11 ------ .../transforms/functional/_geometry.py | 39 ++++--------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 34f1f875a..c8cca77e0 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -19,7 +19,6 @@ from prototype_common_utils import ( make_video_loaders, mark_framework_limitation, TestMark, - VALID_EXTRA_DIMS, ) from torchvision.prototype import features from torchvision.transforms.functional_tensor import _max_value as get_max_value @@ -215,16 +214,6 @@ def sample_inputs_resize_image_tensor(): ): yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) - # We have a speed hack in 
place for nearest interpolation and single channel images (grayscale) - for image_loader in make_image_loaders( - sizes=["random"], - color_spaces=[features.ColorSpace.GRAY], - extra_dims=VALID_EXTRA_DIMS, - ): - yield ArgsKwargs( - image_loader, size=[min(image_loader.image_size) + 1], interpolation=F.InterpolationMode.NEAREST - ) - yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 2c064245e..93df59ad6 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -14,12 +14,7 @@ from torchvision.transforms.functional import ( pil_to_tensor, to_pil_image, ) -from torchvision.transforms.functional_tensor import ( - _cast_squeeze_in, - _cast_squeeze_out, - _parse_pad_padding, - interpolate, -) +from torchvision.transforms.functional_tensor import _parse_pad_padding from ._meta import ( convert_format_bounding_box, @@ -130,32 +125,12 @@ def resize_image_tensor( if image.numel() > 0: image = image.view(-1, num_channels, old_height, old_width) - # This is a perf hack to avoid slow channels_last upsample code path - # Related issue: https://github.com/pytorch/pytorch/issues/83840 - # We are transforming (N, 1, H, W) into (N, 2, H, W) to force to take channels_first path - if image.shape[1] == 1 and interpolation == InterpolationMode.NEAREST: - # Below code is copied from _FT.resize - # This is due to the fact that we need to apply the hack on casted image and not before - # Otherwise, image will be copied while cast to float and interpolate will work on twice more data - image, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(image, [torch.float32, torch.float64]) - - shape = (image.shape[0], 2, image.shape[2], image.shape[3]) - image = image.expand(shape) - - image = interpolate( - image, size=[new_height, new_width], mode=interpolation.value, align_corners=None, antialias=False - ) - - image = image[:, 0, ...] - image = _cast_squeeze_out(image, need_cast=need_cast, need_squeeze=need_squeeze, out_dtype=out_dtype) - - else: - image = _FT.resize( - image, - size=[new_height, new_width], - interpolation=interpolation.value, - antialias=antialias, - ) + image = _FT.resize( + image, + size=[new_height, new_width], + interpolation=interpolation.value, + antialias=antialias, + ) return image.view(extra_dims + (num_channels, new_height, new_width)) -- GitLab From a3fe870b0f036e6b7917200b5a884e57c22ec6cf Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 10 Oct 2022 12:40:35 +0100 Subject: [PATCH 028/624] Adding support of Video to remaining Transforms and Kernels (#6724) * Adding support of Video to missed Transforms and Kernels * Fixing Grayscale Transform. * Fixing FiveCrop and TenCrop Transforms. * Fix Linter * Fix more kernels. * Add `five_crop_video` and `ten_crop_video` kernels * Added a TODO. * Missed Video isinstance * nits * Fix bug on AugMix * Nits and TODOs. 
* Reapply Philip's recommendation * Fix mypy and JIT * Fixing test --- torchvision/prototype/features/__init__.py | 12 ++++++- torchvision/prototype/features/_video.py | 1 + torchvision/prototype/transforms/_augment.py | 1 + .../prototype/transforms/_auto_augment.py | 5 +-- torchvision/prototype/transforms/_color.py | 2 +- .../prototype/transforms/_deprecated.py | 16 ++++----- torchvision/prototype/transforms/_geometry.py | 27 +++++++++------ torchvision/prototype/transforms/_meta.py | 10 +++--- torchvision/prototype/transforms/_misc.py | 1 + .../transforms/functional/__init__.py | 2 ++ .../transforms/functional/_augment.py | 2 +- .../transforms/functional/_deprecated.py | 11 ++++--- .../transforms/functional/_geometry.py | 33 ++++++++++++++----- .../prototype/transforms/functional/_meta.py | 6 +++- 14 files changed, 88 insertions(+), 41 deletions(-) diff --git a/torchvision/prototype/features/__init__.py b/torchvision/prototype/features/__init__.py index 6fc2fb6ea..944ae9bd3 100644 --- a/torchvision/prototype/features/__init__.py +++ b/torchvision/prototype/features/__init__.py @@ -13,4 +13,14 @@ from ._image import ( ) from ._label import Label, OneHotLabel from ._mask import Mask -from ._video import ImageOrVideoType, ImageOrVideoTypeJIT, TensorImageOrVideoType, TensorImageOrVideoTypeJIT, Video +from ._video import ( + ImageOrVideoType, + ImageOrVideoTypeJIT, + LegacyVideoType, + LegacyVideoTypeJIT, + TensorImageOrVideoType, + TensorImageOrVideoTypeJIT, + Video, + VideoType, + VideoTypeJIT, +) diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index a58027243..e32c36d5d 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -238,6 +238,7 @@ LegacyVideoTypeJIT = torch.Tensor TensorVideoType = Union[torch.Tensor, Video] TensorVideoTypeJIT = torch.Tensor +# TODO: decide if we should do definitions for both Images and Videos or use unions in the methods ImageOrVideoType = Union[ImageType, VideoType] ImageOrVideoTypeJIT = Union[ImageTypeJIT, VideoTypeJIT] TensorImageOrVideoType = Union[TensorImageType, TensorVideoType] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index bcab0a3f4..7b2dca8a6 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -99,6 +99,7 @@ class RandomErasing(_RandomApplyTransform): return inpt +# TODO: Add support for Video: https://github.com/pytorch/vision/issues/6731 class _BaseMixupCutmix(_RandomApplyTransform): def __init__(self, alpha: float, p: float = 0.5) -> None: super().__init__(p=p) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 6ef9edba3..d078cb2d1 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -483,7 +483,8 @@ class AugMix(_AutoAugmentBase): augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE orig_dims = list(image_or_video.shape) - batch = image_or_video.view([1] * max(4 - image_or_video.ndim, 0) + orig_dims) + expected_dim = 5 if isinstance(orig_image_or_video, features.Video) else 4 + batch = image_or_video.view([1] * max(expected_dim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) # Sample the beta weights for combining the original and augmented image or video. 
To get Beta, we use a @@ -520,7 +521,7 @@ class AugMix(_AutoAugmentBase): mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) if isinstance(orig_image_or_video, (features.Image, features.Video)): - mix = type(orig_image_or_video).wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] + mix = orig_image_or_video.wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 67a6cc3cc..340e721da 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -119,7 +119,7 @@ class RandomPhotometricDistort(Transform): output = inpt[..., permutation, :, :] if isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).wrap_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] + output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.OTHER) # type: ignore[arg-type] elif isinstance(inpt, PIL.Image.Image): output = F.to_image_pil(output) diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index 3979b178f..f8aec22b9 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -29,7 +29,7 @@ class ToTensor(Transform): class Grayscale(Transform): - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, num_output_channels: Literal[1, 3] = 1) -> None: deprecation_msg = ( @@ -52,15 +52,15 @@ class Grayscale(Transform): super().__init__() self.num_output_channels = num_output_channels - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: output = _F.rgb_to_grayscale(inpt, num_output_channels=self.num_output_channels) - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) + if isinstance(inpt, (features.Image, features.Video)): + output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] return output class RandomGrayscale(_RandomApplyTransform): - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, p: float = 0.1) -> None: warnings.warn( @@ -81,8 +81,8 @@ class RandomGrayscale(_RandomApplyTransform): num_input_channels, _, _ = query_chw(sample) return dict(num_input_channels=num_input_channels) - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> features.ImageType: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: output = _F.rgb_to_grayscale(inpt, num_output_channels=params["num_input_channels"]) - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) + if isinstance(inpt, (features.Image, features.Video)): + output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] return output diff --git a/torchvision/prototype/transforms/_geometry.py 
b/torchvision/prototype/transforms/_geometry.py index 37e2aee02..371ea7f69 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -155,12 +155,13 @@ class FiveCrop(Transform): """ Example: >>> class BatchMultiCrop(transforms.Transform): - ... def forward(self, sample: Tuple[Tuple[features.Image, ...], features.Label]): - ... images, labels = sample - ... batch_size = len(images) - ... images = features.Image.wrap_like(images[0], torch.stack(images)) + ... def forward(self, sample: Tuple[Tuple[Union[features.Image, features.Video], ...], features.Label]): + ... images_or_videos, labels = sample + ... batch_size = len(images_or_videos) + ... image_or_video = images_or_videos[0] + ... images_or_videos = image_or_video.wrap_like(image_or_video, torch.stack(images_or_videos)) ... labels = features.Label.wrap_like(labels, labels.repeat(batch_size)) - ... return images, labels + ... return images_or_videos, labels ... >>> image = features.Image(torch.rand(3, 256, 256)) >>> label = features.Label(0) @@ -172,15 +173,21 @@ class FiveCrop(Transform): torch.Size([5]) """ - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, size: Union[int, Sequence[int]]) -> None: super().__init__() self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") def _transform( - self, inpt: features.ImageType, params: Dict[str, Any] - ) -> Tuple[features.ImageType, features.ImageType, features.ImageType, features.ImageType, features.ImageType]: + self, inpt: features.ImageOrVideoType, params: Dict[str, Any] + ) -> Tuple[ + features.ImageOrVideoType, + features.ImageOrVideoType, + features.ImageOrVideoType, + features.ImageOrVideoType, + features.ImageOrVideoType, + ]: return F.five_crop(inpt, self.size) def forward(self, *inputs: Any) -> Any: @@ -194,14 +201,14 @@ class TenCrop(Transform): See :class:`~torchvision.prototype.transforms.FiveCrop` for an example. 
""" - _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor) + _transformed_types = (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video) def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None: super().__init__() self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip - def _transform(self, inpt: features.ImageType, params: Dict[str, Any]) -> List[features.ImageType]: + def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> List[features.ImageOrVideoType]: return F.ten_crop(inpt, self.size, vertical_flip=self.vertical_flip) def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index 74fbcd60f..e5c7d05b0 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -22,18 +22,18 @@ class ConvertBoundingBoxFormat(Transform): class ConvertImageDtype(Transform): - _transformed_types = (features.is_simple_tensor, features.Image) + _transformed_types = (features.is_simple_tensor, features.Image, features.Video) def __init__(self, dtype: torch.dtype = torch.float32) -> None: super().__init__() self.dtype = dtype - def _transform(self, inpt: features.TensorImageType, params: Dict[str, Any]) -> features.TensorImageType: + def _transform( + self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any] + ) -> features.TensorImageOrVideoType: output = F.convert_image_dtype(inpt, dtype=self.dtype) return ( - output - if features.is_simple_tensor(inpt) - else features.Image.wrap_like(inpt, output) # type: ignore[arg-type] + output if features.is_simple_tensor(inpt) else type(inpt).wrap_like(inpt, output) # type: ignore[attr-defined] ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index dd1e1cdf8..d3c8a57dc 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -140,6 +140,7 @@ class GaussianBlur(Transform): return F.gaussian_blur(inpt, self.kernel_size, **params) +# TODO: Enhance as described at https://github.com/pytorch/vision/issues/6697 class ToDtype(Lambda): def __init__(self, dtype: torch.dtype, *types: Type) -> None: self.dtype = dtype diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index 1e918cc34..579442dc7 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -96,6 +96,7 @@ from ._geometry import ( five_crop, five_crop_image_pil, five_crop_image_tensor, + five_crop_video, hflip, # TODO: Consider moving all pure alias definitions at the bottom of the file horizontal_flip, horizontal_flip_bounding_box, @@ -136,6 +137,7 @@ from ._geometry import ( ten_crop, ten_crop_image_pil, ten_crop_image_tensor, + ten_crop_video, vertical_flip, vertical_flip_bounding_box, vertical_flip_image_pil, diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 847343dbf..57c3602cc 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -35,7 +35,7 @@ def erase( if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) if not 
torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): - output = type(inpt).wrap_like(inpt, output) # type: ignore[arg-type] + output = inpt.wrap_like(inpt, output) # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return erase_image_pil(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) diff --git a/torchvision/prototype/transforms/functional/_deprecated.py b/torchvision/prototype/transforms/functional/_deprecated.py index cbdea5130..854920b96 100644 --- a/torchvision/prototype/transforms/functional/_deprecated.py +++ b/torchvision/prototype/transforms/functional/_deprecated.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List +from typing import Any, List, Union import PIL.Image import torch @@ -22,10 +22,13 @@ def to_grayscale(inpt: PIL.Image.Image, num_output_channels: int = 1) -> PIL.Ima return _F.to_grayscale(inpt, num_output_channels=num_output_channels) -def rgb_to_grayscale(inpt: features.LegacyImageTypeJIT, num_output_channels: int = 1) -> features.LegacyImageTypeJIT: +def rgb_to_grayscale( + inpt: Union[features.LegacyImageTypeJIT, features.LegacyVideoTypeJIT], num_output_channels: int = 1 +) -> Union[features.LegacyImageTypeJIT, features.LegacyVideoTypeJIT]: old_color_space = ( features._image._from_tensor_shape(inpt.shape) # type: ignore[arg-type] - if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Image)) + if isinstance(inpt, torch.Tensor) + and (torch.jit.is_scripting() or not isinstance(inpt, (features.Image, features.Video))) else None ) @@ -56,7 +59,7 @@ def to_tensor(inpt: Any) -> torch.Tensor: return _F.to_tensor(inpt) -def get_image_size(inpt: features.ImageTypeJIT) -> List[int]: +def get_image_size(inpt: features.ImageOrVideoTypeJIT) -> List[int]: warnings.warn( "The function `get_image_size(...)` is deprecated and will be removed in a future release. " "Instead, please use `get_spatial_size(...)` which returns `[h, w]` instead of `[w, h]`." 
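The dispatcher pattern in the hunks above repeats throughout this patch: a plain tensor is routed to the `*_image_tensor` kernel, while a `features.Image` or `features.Video` input is re-wrapped via `wrap_like` so the subclass and its metadata survive the op. A minimal sketch of what that buys the caller, assuming the prototype API shown in these hunks (the shapes and values below are illustrative, not part of the patch):

import torch
from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F

# A single clip laid out as (T, C, H, W); erase() indexes only the trailing
# (..., H, W) dims, so the same kernel serves images and videos.
video = features.Video(torch.rand(8, 3, 32, 32))

# The value tensor matches the erased region: (C, h, w).
out = F.erase(video, i=2, j=2, h=8, w=8, v=torch.zeros(3, 8, 8))

# Because the input was a features.Video, the dispatcher re-wraps the result,
# so downstream transforms still see a Video rather than a bare tensor.
assert isinstance(out, features.Video)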
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 93df59ad6..44b4986ab 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1376,16 +1376,27 @@ def five_crop_image_pil( return tl, tr, bl, br, center +def five_crop_video( + video: torch.Tensor, size: List[int] +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + return five_crop_image_tensor(video, size) + + def five_crop( - inpt: features.ImageTypeJIT, size: List[int] + inpt: features.ImageOrVideoTypeJIT, size: List[int] ) -> Tuple[ - features.ImageTypeJIT, features.ImageTypeJIT, features.ImageTypeJIT, features.ImageTypeJIT, features.ImageTypeJIT + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, + features.ImageOrVideoTypeJIT, ]: - # TODO: consider breaking BC here to return List[features.ImageTypeJIT] to align this op with `ten_crop` + # TODO: consider breaking BC here to return List[features.ImageOrVideoTypeJIT] to align this op with `ten_crop` if isinstance(inpt, torch.Tensor): output = five_crop_image_tensor(inpt, size) - if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = tuple(features.Image.wrap_like(inpt, item) for item in output) # type: ignore[assignment] + if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): + tmp = tuple(inpt.wrap_like(inpt, item) for item in output) # type: ignore[arg-type] + output = tmp # type: ignore[assignment] return output else: # isinstance(inpt, PIL.Image.Image): return five_crop_image_pil(inpt, size) @@ -1418,11 +1429,17 @@ def ten_crop_image_pil(image: PIL.Image.Image, size: List[int], vertical_flip: b return [tl, tr, bl, br, center, tl_flip, tr_flip, bl_flip, br_flip, center_flip] -def ten_crop(inpt: features.ImageTypeJIT, size: List[int], vertical_flip: bool = False) -> List[features.ImageTypeJIT]: +def ten_crop_video(video: torch.Tensor, size: List[int], vertical_flip: bool = False) -> List[torch.Tensor]: + return ten_crop_image_tensor(video, size, vertical_flip=vertical_flip) + + +def ten_crop( + inpt: features.ImageOrVideoTypeJIT, size: List[int], vertical_flip: bool = False +) -> List[features.ImageOrVideoTypeJIT]: if isinstance(inpt, torch.Tensor): output = ten_crop_image_tensor(inpt, size, vertical_flip=vertical_flip) - if not torch.jit.is_scripting() and isinstance(inpt, features.Image): - output = [features.Image.wrap_like(inpt, item) for item in output] + if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): + output = [inpt.wrap_like(inpt, item) for item in output] # type: ignore[arg-type] return output else: # isinstance(inpt, PIL.Image.Image): return ten_crop_image_pil(inpt, size, vertical_flip=vertical_flip) diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index e24b68c9f..c03d65c95 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -55,6 +55,10 @@ def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: return [height, width] +# TODO: Should we have get_spatial_size_video here? How about masks/bbox etc? What is the criterion for deciding when +# a kernel will be created? 
+ + def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return get_spatial_size_image_tensor(inpt) @@ -246,7 +250,7 @@ def convert_color_space( ): if old_color_space is None: raise RuntimeError( - "In order to convert the color space of simple tensor images, " + "In order to convert the color space of simple tensors, " "the `old_color_space=...` parameter needs to be passed." ) return convert_color_space_image_tensor( -- GitLab From 12adc5426ef345ab7999661538a60da99dd85281 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 10 Oct 2022 14:56:22 +0100 Subject: [PATCH 029/624] Add video support on MixUp and CutMix (#6733) * Add video support on MixUp and CutMix * Switch back to roll * Fix tests and mypy * Another mypy fix --- test/test_prototype_transforms.py | 9 +++-- torchvision/prototype/transforms/_augment.py | 37 +++++++++++-------- .../prototype/transforms/_auto_augment.py | 4 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 4037a7467..d7a41e7c1 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -112,9 +112,12 @@ class TestSmoke: ( transform, [ - dict(image=image, one_hot_label=one_hot_label) - for image, one_hot_label in itertools.product( - make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + dict(inpt=inpt, one_hot_label=one_hot_label) + for inpt, one_hot_label in itertools.product( + itertools.chain( + make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + make_videos(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), + ), make_one_hot_labels(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]), ) ], diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 7b2dca8a6..4bfb5c9ed 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -107,8 +107,11 @@ class _BaseMixupCutmix(_RandomApplyTransform): self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) def forward(self, *inputs: Any) -> Any: - if not (has_any(inputs, features.Image, features.is_simple_tensor) and has_any(inputs, features.OneHotLabel)): - raise TypeError(f"{type(self).__name__}() is only defined for tensor images and one-hot labels.") + if not ( + has_any(inputs, features.Image, features.Video, features.is_simple_tensor) + and has_any(inputs, features.OneHotLabel) + ): + raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") if has_any(inputs, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." 
@@ -119,7 +122,7 @@ class _BaseMixupCutmix(_RandomApplyTransform): if inpt.ndim < 2: raise ValueError("Need a batch of one hot labels") output = inpt.clone() - output = output.roll(1, -2).mul_(1 - lam).add_(output.mul_(lam)) + output = output.roll(1, 0).mul_(1.0 - lam).add_(output.mul_(lam)) return features.OneHotLabel.wrap_like(inpt, output) @@ -129,14 +132,15 @@ class RandomMixup(_BaseMixupCutmix): def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: lam = params["lam"] - if isinstance(inpt, features.Image) or features.is_simple_tensor(inpt): - if inpt.ndim < 4: - raise ValueError("Need a batch of images") + if isinstance(inpt, (features.Image, features.Video)) or features.is_simple_tensor(inpt): + expected_ndim = 5 if isinstance(inpt, features.Video) else 4 + if inpt.ndim < expected_ndim: + raise ValueError("The transform expects a batched input") output = inpt.clone() - output = output.roll(1, -4).mul_(1 - lam).add_(output.mul_(lam)) + output = output.roll(1, 0).mul_(1.0 - lam).add_(output.mul_(lam)) - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output) + if isinstance(inpt, (features.Image, features.Video)): + output = type(inpt).wrap_like(inpt, output) # type: ignore[arg-type] return output elif isinstance(inpt, features.OneHotLabel): @@ -169,17 +173,18 @@ class RandomCutmix(_BaseMixupCutmix): return dict(box=box, lam_adjusted=lam_adjusted) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - if isinstance(inpt, features.Image) or features.is_simple_tensor(inpt): + if isinstance(inpt, (features.Image, features.Video)) or features.is_simple_tensor(inpt): box = params["box"] - if inpt.ndim < 4: - raise ValueError("Need a batch of images") + expected_ndim = 5 if isinstance(inpt, features.Video) else 4 + if inpt.ndim < expected_ndim: + raise ValueError("The transform expects a batched input") x1, y1, x2, y2 = box - image_rolled = inpt.roll(1, -4) + rolled = inpt.roll(1, 0) output = inpt.clone() - output[..., y1:y2, x1:x2] = image_rolled[..., y1:y2, x1:x2] + output[..., y1:y2, x1:x2] = rolled[..., y1:y2, x1:x2] - if isinstance(inpt, features.Image): - output = features.Image.wrap_like(inpt, output) + if isinstance(inpt, (features.Image, features.Video)): + output = inpt.wrap_like(inpt, output) # type: ignore[arg-type] return output elif isinstance(inpt, features.OneHotLabel): diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index d078cb2d1..b35b5529b 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -483,8 +483,8 @@ class AugMix(_AutoAugmentBase): augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE orig_dims = list(image_or_video.shape) - expected_dim = 5 if isinstance(orig_image_or_video, features.Video) else 4 - batch = image_or_video.view([1] * max(expected_dim - image_or_video.ndim, 0) + orig_dims) + expected_ndim = 5 if isinstance(orig_image_or_video, features.Video) else 4 + batch = image_or_video.view([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) # Sample the beta weights for combining the original and augmented image or video. 
To get Beta, we use a -- GitLab From 3099e0cc73610ccd39cca7fccbb72fce920f09de Mon Sep 17 00:00:00 2001 From: vsuryamurthy Date: Tue, 11 Oct 2022 09:55:08 +0200 Subject: [PATCH 030/624] Add missing type hints to anchor_utils (#6735) * Use the variable name sizes instead of scales for consistency * Add the missing type hints * Restore the naming back to scales instead of sizes to avoid backwards incompatibility --- torchvision/models/detection/anchor_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/models/detection/anchor_utils.py b/torchvision/models/detection/anchor_utils.py index f42c10d82..cdf572a8b 100644 --- a/torchvision/models/detection/anchor_utils.py +++ b/torchvision/models/detection/anchor_utils.py @@ -61,7 +61,7 @@ class AnchorGenerator(nn.Module): aspect_ratios: List[float], dtype: torch.dtype = torch.float32, device: torch.device = torch.device("cpu"), - ): + ) -> Tensor: scales = torch.as_tensor(scales, dtype=dtype, device=device) aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device) h_ratios = torch.sqrt(aspect_ratios) @@ -76,7 +76,7 @@ class AnchorGenerator(nn.Module): def set_cell_anchors(self, dtype: torch.dtype, device: torch.device): self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device) for cell_anchor in self.cell_anchors] - def num_anchors_per_location(self): + def num_anchors_per_location(self) -> List[int]: return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)] # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2), @@ -201,7 +201,7 @@ class DefaultBoxGenerator(nn.Module): _wh_pairs.append(torch.as_tensor(wh_pairs, dtype=dtype, device=device)) return _wh_pairs - def num_anchors_per_location(self): + def num_anchors_per_location(self) -> List[int]: # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. return [2 + 2 * len(r) for r in self.aspect_ratios] -- GitLab From 4d4711d970f5cbd0a9e1adb465dca2703c8efbfd Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 11 Oct 2022 10:10:48 +0100 Subject: [PATCH 031/624] [prototype] Switch to `spatial_size` (#6736) * Change `image_size` to `spatial_size` * Fix linter * Fixing more tests. * Adding get_num_channels_video and get_spatial_size_* kernels for video, masks and bboxes. * Refactor get_spatial_size * Reduce the usage of `query_chw` where possible * Rename `query_chw` to `query_spatial_size` * Adding `get_num_frames` dispatcher and kernel. 
* Adding jit-scriptability tests --- test/prototype_common_utils.py | 55 ++++---- test/prototype_transforms_kernel_infos.py | 66 +++++----- test/test_prototype_transforms.py | 118 +++++++++--------- test/test_prototype_transforms_consistency.py | 10 +- test/test_prototype_transforms_functional.py | 115 ++++++++--------- test/test_prototype_transforms_utils.py | 4 +- .../prototype/datasets/_builtin/caltech.py | 4 +- .../prototype/datasets/_builtin/celeba.py | 2 +- .../prototype/datasets/_builtin/coco.py | 16 ++- .../prototype/datasets/_builtin/cub200.py | 12 +- .../prototype/datasets/_builtin/gtsrb.py | 2 +- .../datasets/_builtin/stanford_cars.py | 2 +- .../prototype/datasets/_builtin/voc.py | 2 +- .../prototype/features/_bounding_box.py | 54 ++++---- torchvision/prototype/features/_encoded.py | 8 +- torchvision/prototype/features/_image.py | 2 +- torchvision/prototype/features/_mask.py | 2 +- torchvision/prototype/features/_video.py | 3 +- torchvision/prototype/transforms/_augment.py | 4 +- torchvision/prototype/transforms/_color.py | 2 +- .../prototype/transforms/_deprecated.py | 2 +- torchvision/prototype/transforms/_geometry.py | 38 +++--- torchvision/prototype/transforms/_meta.py | 2 +- torchvision/prototype/transforms/_utils.py | 18 ++- .../transforms/functional/__init__.py | 6 + .../transforms/functional/_geometry.py | 56 ++++----- .../prototype/transforms/functional/_meta.py | 48 +++++-- 27 files changed, 354 insertions(+), 299 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 1d5766b1f..220a793ac 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -184,13 +184,18 @@ class ArgsKwargs: return args, kwargs -DEFAULT_SQUARE_IMAGE_SIZE = 15 -DEFAULT_LANDSCAPE_IMAGE_SIZE = (7, 33) -DEFAULT_PORTRAIT_IMAGE_SIZE = (31, 9) -DEFAULT_IMAGE_SIZES = (DEFAULT_LANDSCAPE_IMAGE_SIZE, DEFAULT_PORTRAIT_IMAGE_SIZE, DEFAULT_SQUARE_IMAGE_SIZE, "random") +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, + "random", +) -def _parse_image_size(size, *, name="size"): +def _parse_spatial_size(size, *, name="size"): if size == "random": return tuple(torch.randint(15, 33, (2,)).tolist()) elif isinstance(size, int) and size > 0: @@ -246,11 +251,11 @@ class TensorLoader: @dataclasses.dataclass class ImageLoader(TensorLoader): color_space: features.ColorSpace - image_size: Tuple[int, int] = dataclasses.field(init=False) + spatial_size: Tuple[int, int] = dataclasses.field(init=False) num_channels: int = dataclasses.field(init=False) def __post_init__(self): - self.image_size = self.shape[-2:] + self.spatial_size = self.shape[-2:] self.num_channels = self.shape[-3] @@ -277,7 +282,7 @@ def make_image_loader( dtype=torch.float32, constant_alpha=True, ): - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) def fn(shape, dtype, device): @@ -295,7 +300,7 @@ make_image = from_loader(make_image_loader) def make_image_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, color_spaces=( features.ColorSpace.GRAY, features.ColorSpace.GRAY_ALPHA, @@ -316,7 +321,7 @@ make_images = from_loaders(make_image_loaders) @dataclasses.dataclass class BoundingBoxLoader(TensorLoader): format: features.BoundingBoxFormat - image_size: Tuple[int, int] + spatial_size: Tuple[int, int] def 
randint_with_tensor_bounds(arg1, arg2=None, **kwargs): @@ -331,7 +336,7 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): ).reshape(low.shape) -def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtype=torch.float32): +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32): if isinstance(format, str): format = features.BoundingBoxFormat[format] if format not in { @@ -341,7 +346,7 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp }: raise pytest.UsageError(f"Can't make bounding box in format {format}") - image_size = _parse_image_size(image_size, name="image_size") + spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") def fn(shape, dtype, device): *extra_dims, num_coordinates = shape @@ -350,10 +355,10 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp if any(dim == 0 for dim in extra_dims): return features.BoundingBox( - torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, image_size=image_size + torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - height, width = image_size + height, width = spatial_size if format == features.BoundingBoxFormat.XYXY: x1 = torch.randint(0, width // 2, extra_dims) @@ -375,10 +380,10 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp parts = (cx, cy, w, h) return features.BoundingBox( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, image_size=image_size + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, image_size=image_size) + return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) make_bounding_box = from_loader(make_bounding_box_loader) @@ -388,11 +393,11 @@ def make_bounding_box_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(features.BoundingBoxFormat), - image_size="random", + spatial_size="random", dtypes=(torch.float32, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, image_size=image_size) + yield make_bounding_box_loader(**params, spatial_size=spatial_size) make_bounding_boxes = from_loaders(make_bounding_box_loaders) @@ -475,7 +480,7 @@ class MaskLoader(TensorLoader): def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8): # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects def fn(shape, dtype, device): @@ -489,7 +494,7 @@ make_detection_mask = from_loader(make_detection_mask_loader) def make_detection_mask_loaders( - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), @@ -503,7 +508,7 @@ make_detection_masks = from_loaders(make_detection_mask_loaders) def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): # This produces "segmentation" masks, i.e. 
`(*, H, W)`, where the category is encoded in the values - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories def fn(shape, dtype, device): @@ -518,7 +523,7 @@ make_segmentation_mask = from_loader(make_segmentation_mask_loader) def make_segmentation_mask_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_categories=(1, 2, "random"), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), @@ -532,7 +537,7 @@ make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) def make_mask_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), num_categories=(1, 2, "random"), extra_dims=DEFAULT_EXTRA_DIMS, @@ -559,7 +564,7 @@ def make_video_loader( extra_dims=(), dtype=torch.uint8, ): - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames def fn(shape, dtype, device): @@ -576,7 +581,7 @@ make_video = from_loader(make_video_loader) def make_video_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, color_spaces=( features.ColorSpace.GRAY, features.ColorSpace.RGB, diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c8cca77e0..239425d17 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -145,7 +145,7 @@ def sample_inputs_horizontal_flip_bounding_box(): formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -185,9 +185,9 @@ KERNEL_INFOS.extend( ) -def _get_resize_sizes(image_size): - height, width = image_size - length = max(image_size) +def _get_resize_sizes(spatial_size): + height, width = spatial_size + length = max(spatial_size) yield length yield [length] yield (length,) @@ -201,7 +201,7 @@ def sample_inputs_resize_image_tensor(): for image_loader in make_image_loaders( sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] ): - for size in _get_resize_sizes(image_loader.image_size): + for size in _get_resize_sizes(image_loader.spatial_size): yield ArgsKwargs(image_loader, size=size) for image_loader, interpolation in itertools.product( @@ -212,7 +212,7 @@ def sample_inputs_resize_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) + yield ArgsKwargs(image_loader, size=[min(image_loader.spatial_size) + 1], interpolation=interpolation) yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) @@ -236,7 +236,7 @@ def reference_inputs_resize_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - for size in _get_resize_sizes(image_loader.image_size): + for size in _get_resize_sizes(image_loader.spatial_size): yield ArgsKwargs( image_loader, size=size, @@ -251,8 +251,8 @@ def reference_inputs_resize_image_tensor(): def sample_inputs_resize_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - for size in _get_resize_sizes(bounding_box_loader.image_size): - yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) + for size in 
_get_resize_sizes(bounding_box_loader.spatial_size): + yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size) def sample_inputs_resize_mask(): @@ -394,7 +394,7 @@ def sample_inputs_affine_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, **affine_params, ) @@ -422,9 +422,9 @@ def _compute_affine_matrix(angle, translate, scale, shear, center): return true_matrix -def reference_affine_bounding_box(bounding_box, *, format, image_size, angle, translate, scale, shear, center=None): +def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, translate, scale, shear, center=None): if center is None: - center = [s * 0.5 for s in image_size[::-1]] + center = [s * 0.5 for s in spatial_size[::-1]] def transform(bbox): affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center) @@ -473,7 +473,7 @@ def reference_inputs_affine_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, **affine_kwargs, ) @@ -650,7 +650,7 @@ def sample_inputs_vertical_flip_bounding_box(): formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -729,7 +729,7 @@ def sample_inputs_rotate_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, angle=_ROTATE_ANGLES[0], ) @@ -1001,7 +1001,7 @@ def sample_inputs_pad_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, padding=padding, padding_mode="constant", ) @@ -1131,13 +1131,13 @@ KERNEL_INFOS.extend( ) -def _get_elastic_displacement(image_size): - return torch.rand(1, *image_size, 2) +def _get_elastic_displacement(spatial_size): + return torch.rand(1, *spatial_size, 2) def sample_inputs_elastic_image_tensor(): for image_loader in make_image_loaders(sizes=["random"]): - displacement = _get_elastic_displacement(image_loader.image_size) + displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -1151,14 +1151,14 @@ def reference_inputs_elastic_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - displacement = _get_elastic_displacement(image_loader.image_size) + displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) def sample_inputs_elastic_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_box_loader.image_size) + displacement = _get_elastic_displacement(bounding_box_loader.spatial_size) yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, @@ -1212,7 +1212,7 @@ KERNEL_INFOS.extend( ) 
-_CENTER_CROP_IMAGE_SIZES = [(16, 16), (7, 33), (31, 9)] +_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)] _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] @@ -1231,7 +1231,7 @@ def sample_inputs_center_crop_image_tensor(): def reference_inputs_center_crop_image_tensor(): for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES + make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES ): yield ArgsKwargs(image_loader, output_size=output_size) @@ -1241,7 +1241,7 @@ def sample_inputs_center_crop_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, output_size=output_size, ) @@ -1254,7 +1254,7 @@ def sample_inputs_center_crop_mask(): def reference_inputs_center_crop_mask(): for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES + make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES ): yield ArgsKwargs(mask_loader, output_size=output_size) @@ -1820,7 +1820,7 @@ KERNEL_INFOS.extend( def sample_inputs_clamp_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -1834,7 +1834,7 @@ KERNEL_INFOS.append( _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] -def _get_five_ten_crop_image_size(size): +def _get_five_ten_crop_spatial_size(size): if isinstance(size, int): crop_height = crop_width = size elif len(size) == 1: @@ -1847,28 +1847,32 @@ def _get_five_ten_crop_image_size(size): def sample_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + sizes=[_get_five_ten_crop_spatial_size(size)], + color_spaces=[features.ColorSpace.RGB], + dtypes=[torch.float32], ): yield ArgsKwargs(image_loader, size=size) def reference_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): + for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]): yield ArgsKwargs(image_loader, size=size) def sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + sizes=[_get_five_ten_crop_spatial_size(size)], + color_spaces=[features.ColorSpace.RGB], + dtypes=[torch.float32], ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) def reference_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): + for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]): yield 
ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index d7a41e7c1..2c095fa6e 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -298,7 +298,7 @@ class TestRandomHorizontalFlip: assert_equal(features.Mask(expected), actual) def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) + input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10)) transform = transforms.RandomHorizontalFlip(p=p) actual = transform(input) @@ -307,7 +307,7 @@ class TestRandomHorizontalFlip: expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format - assert actual.image_size == expected.image_size + assert actual.spatial_size == expected.spatial_size @pytest.mark.parametrize("p", [0.0, 1.0]) @@ -351,7 +351,7 @@ class TestRandomVerticalFlip: assert_equal(features.Mask(expected), actual) def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) + input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10)) transform = transforms.RandomVerticalFlip(p=p) actual = transform(input) @@ -360,7 +360,7 @@ class TestRandomVerticalFlip: expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format - assert actual.image_size == expected.image_size + assert actual.spatial_size == expected.spatial_size class TestPad: @@ -435,7 +435,7 @@ class TestRandomZoomOut: transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) image = mocker.MagicMock(spec=features.Image) - h, w = image.image_size = (24, 32) + h, w = image.spatial_size = (24, 32) params = transform._get_params(image) @@ -450,7 +450,7 @@ class TestRandomZoomOut: def test__transform(self, fill, side_range, mocker): inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) @@ -559,17 +559,17 @@ class TestRandomRotation: @pytest.mark.parametrize("angle", [34, -87]) @pytest.mark.parametrize("expand", [False, True]) - def test_boundingbox_image_size(self, angle, expand): + def test_boundingbox_spatial_size(self, angle, expand): # Specific test for BoundingBox.rotate bbox = features.BoundingBox( - torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) + torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, spatial_size=(32, 32) ) img = features.Image(torch.rand(1, 3, 32, 32)) out_img = img.rotate(angle, expand=expand) out_bbox = bbox.rotate(angle, expand=expand) - assert out_img.image_size == out_bbox.image_size + assert out_img.spatial_size == out_bbox.spatial_size class TestRandomAffine: @@ -619,8 +619,8 @@ class TestRandomAffine: def test__get_params(self, degrees, translate, scale, shear, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size + image.spatial_size = (24, 32) + h, w = image.spatial_size transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) params = transform._get_params(image) @@ 
-682,7 +682,7 @@ class TestRandomAffine: fn = mocker.patch("torchvision.prototype.transforms.functional.affine") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -718,8 +718,8 @@ class TestRandomCrop: def test__get_params(self, padding, pad_if_needed, size, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size + image.spatial_size = (24, 32) + h, w = image.spatial_size transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) params = transform._get_params(image) @@ -771,19 +771,19 @@ class TestRandomCrop: inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (32, 32) + inpt.spatial_size = (32, 32) expected = mocker.MagicMock(spec=features.Image) expected.num_channels = 3 if isinstance(padding, int): - expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) + expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding) elif isinstance(padding, list): - expected.image_size = ( - inpt.image_size[0] + sum(padding[0::2]), - inpt.image_size[1] + sum(padding[1::2]), + expected.spatial_size = ( + inpt.spatial_size[0] + sum(padding[0::2]), + inpt.spatial_size[1] + sum(padding[1::2]), ) else: - expected.image_size = inpt.image_size + expected.spatial_size = inpt.spatial_size _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") @@ -859,7 +859,7 @@ class TestGaussianBlur: fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -910,11 +910,11 @@ class TestRandomPerspective: transform = transforms.RandomPerspective(dscale) image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) params = transform._get_params(image) - h, w = image.image_size + h, w = image.spatial_size assert "perspective_coeffs" in params assert len(params["perspective_coeffs"]) == 8 @@ -927,7 +927,7 @@ class TestRandomPerspective: fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users # Otherwise, we can mock transform._get_params @@ -971,11 +971,11 @@ class TestElasticTransform: transform = transforms.ElasticTransform(alpha, sigma) image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) params = transform._get_params(image) - h, w = image.image_size + h, w = image.spatial_size displacement = params["displacement"] assert displacement.shape == (1, h, w, 2) assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() @@ -1001,7 +1001,7 @@ class TestElasticTransform: fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") inpt = 
mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() @@ -1030,7 +1030,7 @@ class TestRandomErasing: image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=[1, 2, 3, 4]) @@ -1041,7 +1041,7 @@ class TestRandomErasing: def test__get_params(self, value, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=value) params = transform._get_params(image) @@ -1057,8 +1057,8 @@ class TestRandomErasing: elif isinstance(value, (list, tuple)): assert v.shape == (image.num_channels, 1, 1) - assert 0 <= i <= image.image_size[0] - h - assert 0 <= j <= image.image_size[1] - w + assert 0 <= i <= image.spatial_size[0] - h + assert 0 <= j <= image.spatial_size[1] - w @pytest.mark.parametrize("p", [0, 1]) def test__transform(self, mocker, p): @@ -1222,11 +1222,11 @@ class TestRandomIoUCrop: def test__get_params(self, device, options, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) bboxes = features.BoundingBox( torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", - image_size=image.image_size, + spatial_size=image.spatial_size, device=device, ) sample = [image, bboxes] @@ -1245,8 +1245,8 @@ class TestRandomIoUCrop: assert len(params["is_within_crop_area"]) > 0 assert params["is_within_crop_area"].dtype == torch.bool - orig_h = image.image_size[0] - orig_w = image.image_size[1] + orig_h = image.spatial_size[0] + orig_w = image.spatial_size[1] assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) @@ -1261,7 +1261,7 @@ class TestRandomIoUCrop: def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) image = features.Image(torch.rand(1, 3, 4, 4)) - bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", image_size=(4, 4)) + bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) label = features.Label(torch.tensor([1])) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -1281,7 +1281,7 @@ class TestRandomIoUCrop: transform = transforms.RandomIoUCrop() image = features.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) + bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,)) label = features.Label(torch.randint(0, 10, size=(6,))) ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) masks = make_detection_mask((32, 24), num_objects=6) @@ -1329,12 +1329,12 @@ class TestRandomIoUCrop: class TestScaleJitter: def test__get_params(self, mocker): - image_size = (24, 32) + spatial_size = (24, 32) target_size = (16, 12) scale_range = (0.5, 1.5) transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) + sample = mocker.MagicMock(spec=features.Image, 
num_channels=3, spatial_size=spatial_size) n_samples = 5 for _ in range(n_samples): @@ -1347,11 +1347,11 @@ class TestScaleJitter: assert isinstance(size, tuple) and len(size) == 2 height, width = size - r_min = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[0] - r_max = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[1] + r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0] + r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1] - assert int(image_size[0] * r_min) <= height <= int(image_size[0] * r_max) - assert int(image_size[1] * r_min) <= width <= int(image_size[1] * r_max) + assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max) + assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max) def test__transform(self, mocker): interpolation_sentinel = mocker.MagicMock() @@ -1379,13 +1379,13 @@ class TestScaleJitter: class TestRandomShortestSize: def test__get_params(self, mocker): - image_size = (3, 10) + spatial_size = (3, 10) min_size = [5, 9] max_size = 20 transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) + sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size) params = transform._get_params(sample) assert "size" in params @@ -1504,7 +1504,7 @@ class TestSimpleCopyPaste: labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { "boxes": features.BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": features.Mask(masks), "labels": label_type(labels), @@ -1519,7 +1519,7 @@ class TestSimpleCopyPaste: paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { "boxes": features.BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": features.Mask(paste_masks), "labels": label_type(paste_labels), @@ -1550,14 +1550,14 @@ class TestFixedSizeCrop: def test__get_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - image_size = (11, 5) + spatial_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) sample = dict( - image=make_image(size=image_size, color_space=features.ColorSpace.RGB), + image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB), bounding_boxes=make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape ), ) params = transform._get_params(sample) @@ -1638,7 +1638,7 @@ class TestFixedSizeCrop: def test__transform_culling(self, mocker): batch_size = 10 - image_size = (10, 10) + spatial_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( @@ -1647,17 +1647,17 @@ class TestFixedSizeCrop: needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=spatial_size[0], + width=spatial_size[1], is_valid=is_valid, needs_pad=False, ), ) bounding_boxes = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, 
image_size=image_size, extra_dims=(batch_size,) + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) - masks = make_detection_mask(size=image_size, extra_dims=(batch_size,)) + masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -1678,7 +1678,7 @@ class TestFixedSizeCrop: def test__transform_bounding_box_clamping(self, mocker): batch_size = 3 - image_size = (10, 10) + spatial_size = (10, 10) mocker.patch( "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", @@ -1686,15 +1686,15 @@ class TestFixedSizeCrop: needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=spatial_size[0], + width=spatial_size[1], is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) bounding_box = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index c8debe1e2..f335220fb 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -24,7 +24,7 @@ from torchvision import transforms as legacy_transforms from torchvision._utils import sequence_to_str from torchvision.prototype import features, transforms as prototype_transforms from torchvision.prototype.transforms import functional as F -from torchvision.prototype.transforms._utils import query_chw +from torchvision.prototype.transforms._utils import query_spatial_size from torchvision.prototype.transforms.functional import to_image_pil DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) @@ -871,7 +871,7 @@ class TestRefDetTransforms: pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -881,7 +881,7 @@ class TestRefDetTransforms: tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -891,7 +891,7 @@ class TestRefDetTransforms: feature_image = make_image(size=size, color_space=features.ColorSpace.RGB) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -949,7 +949,7 @@ class PadIfSmaller(prototype_transforms.Transform): self.fill = prototype_transforms._geometry._setup_fill_arg(fill) def _get_params(self, sample): - _, height, width = query_chw(sample) + height, width = 
query_spatial_size(sample) padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] needs_padding = any(padding) return dict(padding=padding, needs_padding=needs_padding) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 8329de697..56c473a23 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -224,11 +224,14 @@ class TestDispatchers: @pytest.mark.parametrize( "dispatcher", [ + F.clamp_bounding_box, F.convert_color_space, F.convert_image_dtype, F.get_dimensions, F.get_image_num_channels, F.get_image_size, + F.get_num_channels, + F.get_num_frames, F.get_spatial_size, F.rgb_to_grayscale, ], @@ -333,16 +336,16 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): @pytest.mark.parametrize("device", cpu_and_gpu()) def test_correctness_affine_bounding_box_on_fixed_input(device): # Check transformation against known expected output - image_size = (64, 64) + spatial_size = (64, 64) # xyxy format in_boxes = [ [20, 25, 35, 45], [50, 5, 70, 22], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], + [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], [1, 1, 5, 5], ] in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device ) # Tested parameters angle = 63 @@ -355,9 +358,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox # expected_bboxes = [] # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *image_size) - # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) - # out_box = denormalize_bbox(n_out_box, *image_size) + # n_in_box = normalize_bbox(in_box, *spatial_size) + # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *spatial_size) + # out_box = denormalize_bbox(n_out_box, *spatial_size) # expected_bboxes.append(out_box) expected_bboxes = [ (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), @@ -369,9 +372,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): output_boxes = F.affine_bounding_box( in_boxes, in_boxes.format, - in_boxes.image_size, + in_boxes.spatial_size, angle, - (dx * image_size[1], dy * image_size[0]), + (dx * spatial_size[1], dy * spatial_size[0]), scale, shear=(0, 0), ) @@ -406,7 +409,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center): affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) affine_matrix = affine_matrix[:2, :] - height, width = bbox.image_size + height, width = bbox.spatial_size bbox_xyxy = convert_format_bounding_box( bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY ) @@ -444,7 +447,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center): out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYXY, - image_size=(height, width), + spatial_size=(height, width), dtype=bbox.dtype, device=bbox.device, ) @@ -455,16 +458,16 @@ def test_correctness_rotate_bounding_box(angle, expand, center): (height, width), ) - image_size = (32, 38) + spatial_size = (32, 38) - for bboxes in 
make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_bboxes, output_image_size = F.rotate_bounding_box( + output_bboxes, output_spatial_size = F.rotate_bounding_box( bboxes, bboxes_format, - image_size=bboxes_image_size, + spatial_size=bboxes_spatial_size, angle=angle, expand=expand, center=center, @@ -472,38 +475,38 @@ def test_correctness_rotate_bounding_box(angle, expand, center): center_ = center if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] + center_ = [s * 0.5 for s in bboxes_spatial_size[::-1]] if bboxes.ndim < 2: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + expected_bbox, expected_spatial_size = _compute_expected_bbox(bbox, -angle, expand, center_) expected_bboxes.append(expected_bbox) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) else: expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0) + torch.testing.assert_close(output_spatial_size, expected_spatial_size, atol=1, rtol=0) @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): # Check transformation against known expected output - image_size = (64, 64) + spatial_size = (64, 64) # xyxy format in_boxes = [ [1, 1, 5, 5], - [1, image_size[0] - 6, 5, image_size[0] - 2], - [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], + [1, spatial_size[0] - 6, 5, spatial_size[0] - 2], + [spatial_size[1] - 6, spatial_size[0] - 6, spatial_size[1] - 2, spatial_size[0] - 2], + [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], ] in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device ) # Tested parameters angle = 45 @@ -535,7 +538,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): output_boxes, _ = F.rotate_bounding_box( in_boxes, in_boxes.format, - in_boxes.image_size, + in_boxes.spatial_size, angle, expand=expand, center=center, @@ -593,11 +596,11 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, [50.0, 5.0, 70.0, 22.0], [45.0, 46.0, 56.0, 62.0], ] - in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) + in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=size, device=device) if format != features.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - output_boxes, output_image_size = F.crop_bounding_box( 
+ output_boxes, output_spatial_size = F.crop_bounding_box( in_boxes, format, top, @@ -610,7 +613,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - torch.testing.assert_close(output_image_size, size) + torch.testing.assert_close(output_spatial_size, size) @pytest.mark.parametrize("device", cpu_and_gpu()) @@ -658,7 +661,7 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height bbox[3] = (bbox[3] - top_) * size_[0] / height_ return bbox - image_size = (100, 100) + spatial_size = (100, 100) # xyxy format in_boxes = [ [10.0, 10.0, 20.0, 20.0], @@ -670,18 +673,18 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height expected_bboxes = torch.tensor(expected_bboxes, device=device) in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device ) if format != features.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) + output_boxes, output_spatial_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) if format != features.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, size) + torch.testing.assert_close(output_spatial_size, size) def _parse_padding(padding): @@ -718,28 +721,28 @@ def test_correctness_pad_bounding_box(device, padding): bbox = bbox.to(bbox_dtype) return bbox - def _compute_expected_image_size(bbox, padding_): + def _compute_expected_spatial_size(bbox, padding_): pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - height, width = bbox.image_size + height, width = bbox.spatial_size return height + pad_up + pad_down, width + pad_left + pad_right for bboxes in make_bounding_boxes(): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_image_size = F.pad_bounding_box( - bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding + output_boxes, output_spatial_size = F.pad_bounding_box( + bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding ) - torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding)) + torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding)) if bboxes.ndim < 2 or bboxes.shape[0] == 0: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, padding)) if len(expected_bboxes) > 1: @@ -807,7 +810,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): out_bbox = features.BoundingBox( np.array(out_bbox), format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, + 
spatial_size=bbox.spatial_size, dtype=bbox.dtype, device=bbox.device, ) @@ -815,15 +818,15 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False ) - image_size = (32, 38) + spatial_size = (32, 38) pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size output_bboxes = F.perspective_bounding_box( bboxes, @@ -836,7 +839,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) @@ -853,14 +856,14 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): def test_correctness_center_crop_bounding_box(device, output_size): def _compute_expected_bbox(bbox, output_size_): format_ = bbox.format - image_size_ = bbox.image_size + spatial_size_ = bbox.spatial_size bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) - cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) + cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5)) out_bbox = [ bbox[0].item() - cx, bbox[1].item() - cy, @@ -870,7 +873,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYWH, - image_size=output_size_, + spatial_size=output_size_, dtype=bbox.dtype, device=bbox.device, ) @@ -879,10 +882,10 @@ def test_correctness_center_crop_bounding_box(device, output_size): for bboxes in make_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_image_size = F.center_crop_bounding_box( - bboxes, bboxes_format, bboxes_image_size, output_size + output_boxes, output_spatial_size = F.center_crop_bounding_box( + bboxes, bboxes_format, bboxes_spatial_size, output_size ) if bboxes.ndim < 2: @@ -890,7 +893,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) if len(expected_bboxes) > 1: @@ -898,7 +901,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): else: expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, output_size) + torch.testing.assert_close(output_spatial_size, output_size) 
@pytest.mark.parametrize("device", cpu_and_gpu()) @@ -926,11 +929,11 @@ def test_correctness_center_crop_mask(device, output_size): # Copied from test/test_functional_tensor.py @pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("image_size", ("small", "large")) +@pytest.mark.parametrize("spatial_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): +def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma): fn = F.gaussian_blur_image_tensor # true_cv2_results = { @@ -950,7 +953,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") true_cv2_results = torch.load(p) - if image_size == "small": + if spatial_size == "small": tensor = ( torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) ) diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py index 9a8ed67dd..3d5960c96 100644 --- a/test/test_prototype_transforms_utils.py +++ b/test/test_prototype_transforms_utils.py @@ -11,8 +11,8 @@ from torchvision.prototype.transforms.functional import to_image_pil IMAGE = make_image(color_space=features.ColorSpace.RGB) -BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) -MASK = make_detection_mask(size=IMAGE.image_size) +BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) +MASK = make_detection_mask(size=IMAGE.spatial_size) @pytest.mark.parametrize( diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index a00bf2e2c..29ed162cc 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -110,7 +110,9 @@ class Caltech101(Dataset): image=image, ann_path=ann_path, bounding_box=BoundingBox( - ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], format="xyxy", image_size=image.image_size + ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], + format="xyxy", + spatial_size=image.spatial_size, ), contour=_Feature(ann["obj_contour"].T), ) diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index a0a021845..3382b62b6 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -144,7 +144,7 @@ class CelebA(Dataset): bounding_box=BoundingBox( [int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")], format="xywh", - image_size=image.image_size, + spatial_size=image.spatial_size, ), landmarks={ landmark: _Feature((int(landmarks[f"{landmark}_x"]), int(landmarks[f"{landmark}_y"]))) diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index 16a16998b..72d76f487 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -97,25 +97,29 @@ class Coco(Dataset): ) return [images, meta] - def _segmentation_to_mask(self, segmentation: Any, *, is_crowd: bool, image_size: 
Tuple[int, int]) -> torch.Tensor: + def _segmentation_to_mask( + self, segmentation: Any, *, is_crowd: bool, spatial_size: Tuple[int, int] + ) -> torch.Tensor: from pycocotools import mask if is_crowd: - segmentation = mask.frPyObjects(segmentation, *image_size) + segmentation = mask.frPyObjects(segmentation, *spatial_size) else: - segmentation = mask.merge(mask.frPyObjects(segmentation, *image_size)) + segmentation = mask.merge(mask.frPyObjects(segmentation, *spatial_size)) return torch.from_numpy(mask.decode(segmentation)).to(torch.bool) def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]: - image_size = (image_meta["height"], image_meta["width"]) + spatial_size = (image_meta["height"], image_meta["width"]) labels = [ann["category_id"] for ann in anns] return dict( # TODO: create a segmentation feature segmentations=_Feature( torch.stack( [ - self._segmentation_to_mask(ann["segmentation"], is_crowd=ann["iscrowd"], image_size=image_size) + self._segmentation_to_mask( + ann["segmentation"], is_crowd=ann["iscrowd"], spatial_size=spatial_size + ) for ann in anns ] ) @@ -125,7 +129,7 @@ class Coco(Dataset): bounding_boxes=BoundingBox( [ann["bbox"] for ann in anns], format="xywh", - image_size=image_size, + spatial_size=spatial_size, ), labels=Label(labels, categories=self._categories), super_categories=[self._category_to_super_category[self._categories[label]] for label in labels], diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index f1531615c..9c32d96f9 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -130,13 +130,13 @@ class CUB200(Dataset): return path.with_suffix(".jpg").name def _2011_prepare_ann( - self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], image_size: Tuple[int, int] + self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], spatial_size: Tuple[int, int] ) -> Dict[str, Any]: _, (bounding_box_data, segmentation_data) = data segmentation_path, segmentation_buffer = segmentation_data return dict( bounding_box=BoundingBox( - [float(part) for part in bounding_box_data[1:]], format="xywh", image_size=image_size + [float(part) for part in bounding_box_data[1:]], format="xywh", spatial_size=spatial_size ), segmentation_path=segmentation_path, segmentation=EncodedImage.from_file(segmentation_buffer), @@ -149,7 +149,9 @@ class CUB200(Dataset): path = pathlib.Path(data[0]) return path.with_suffix(".jpg").name, data - def _2010_prepare_ann(self, data: Tuple[str, Tuple[str, BinaryIO]], image_size: Tuple[int, int]) -> Dict[str, Any]: + def _2010_prepare_ann( + self, data: Tuple[str, Tuple[str, BinaryIO]], spatial_size: Tuple[int, int] + ) -> Dict[str, Any]: _, (path, buffer) = data content = read_mat(buffer) return dict( @@ -157,7 +159,7 @@ class CUB200(Dataset): bounding_box=BoundingBox( [int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")], format="xyxy", - image_size=image_size, + spatial_size=spatial_size, ), segmentation=_Feature(content["seg"]), ) @@ -175,7 +177,7 @@ class CUB200(Dataset): image = EncodedImage.from_file(buffer) return dict( - prepare_ann_fn(anns_data, image.image_size), + prepare_ann_fn(anns_data, image.spatial_size), image=image, label=Label( int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1, diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 
8dc0a8240..e11dc2bb4 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -78,7 +78,7 @@ class GTSRB(Dataset): bounding_box = BoundingBox( [int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")], format="xyxy", - image_size=(int(csv_info["Height"]), int(csv_info["Width"])), + spatial_size=(int(csv_info["Height"]), int(csv_info["Width"])), ) return { diff --git a/torchvision/prototype/datasets/_builtin/stanford_cars.py b/torchvision/prototype/datasets/_builtin/stanford_cars.py index 011204f2b..a0e7a377e 100644 --- a/torchvision/prototype/datasets/_builtin/stanford_cars.py +++ b/torchvision/prototype/datasets/_builtin/stanford_cars.py @@ -89,7 +89,7 @@ class StanfordCars(Dataset): path=path, image=image, label=Label(target[4] - 1, categories=self._categories), - bounding_box=BoundingBox(target[:4], format="xyxy", image_size=image.image_size), + bounding_box=BoundingBox(target[:4], format="xyxy", spatial_size=image.spatial_size), ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: diff --git a/torchvision/prototype/datasets/_builtin/voc.py b/torchvision/prototype/datasets/_builtin/voc.py index 84a9b3a7f..8db82b4aa 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -108,7 +108,7 @@ class VOC(Dataset): for instance in instances ], format="xyxy", - image_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), + spatial_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), ), labels=Label( [self._categories.index(instance["name"]) for instance in instances], categories=self._categories diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py index 7b69af5f9..18c607d4d 100644 --- a/torchvision/prototype/features/_bounding_box.py +++ b/torchvision/prototype/features/_bounding_box.py @@ -17,13 +17,13 @@ class BoundingBoxFormat(StrEnum): class BoundingBox(_Feature): format: BoundingBoxFormat - image_size: Tuple[int, int] + spatial_size: Tuple[int, int] @classmethod - def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, image_size: Tuple[int, int]) -> BoundingBox: + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBox: bounding_box = tensor.as_subclass(cls) bounding_box.format = format - bounding_box.image_size = image_size + bounding_box.spatial_size = spatial_size return bounding_box def __new__( @@ -31,7 +31,7 @@ class BoundingBox(_Feature): data: Any, *, format: Union[BoundingBoxFormat, str], - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, @@ -41,7 +41,7 @@ class BoundingBox(_Feature): if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - return cls._wrap(tensor, format=format, image_size=image_size) + return cls._wrap(tensor, format=format, spatial_size=spatial_size) @classmethod def wrap_like( @@ -50,16 +50,16 @@ class BoundingBox(_Feature): tensor: torch.Tensor, *, format: Optional[BoundingBoxFormat] = None, - image_size: Optional[Tuple[int, int]] = None, + spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: return cls._wrap( tensor, format=format if format is not None else other.format, - image_size=image_size if image_size is not None else 
other.image_size, + spatial_size=spatial_size if spatial_size is not None else other.spatial_size, ) def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, image_size=self.image_size) + return self._make_repr(format=self.format, spatial_size=self.spatial_size) def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: if isinstance(format, str): @@ -70,11 +70,11 @@ class BoundingBox(_Feature): ) def horizontal_flip(self) -> BoundingBox: - output = self._F.horizontal_flip_bounding_box(self, format=self.format, image_size=self.image_size) + output = self._F.horizontal_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size) return BoundingBox.wrap_like(self, output) def vertical_flip(self) -> BoundingBox: - output = self._F.vertical_flip_bounding_box(self, format=self.format, image_size=self.image_size) + output = self._F.vertical_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size) return BoundingBox.wrap_like(self, output) def resize( # type: ignore[override] @@ -84,20 +84,22 @@ class BoundingBox(_Feature): max_size: Optional[int] = None, antialias: bool = False, ) -> BoundingBox: - output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size) - return BoundingBox.wrap_like(self, output, image_size=image_size) + output, spatial_size = self._F.resize_bounding_box( + self, spatial_size=self.spatial_size, size=size, max_size=max_size + ) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: - output, image_size = self._F.crop_bounding_box( + output, spatial_size = self._F.crop_bounding_box( self, self.format, top=top, left=left, height=height, width=width ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def center_crop(self, output_size: List[int]) -> BoundingBox: - output, image_size = self._F.center_crop_bounding_box( - self, format=self.format, image_size=self.image_size, output_size=output_size + output, spatial_size = self._F.center_crop_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, output_size=output_size ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def resized_crop( self, @@ -109,8 +111,8 @@ class BoundingBox(_Feature): interpolation: InterpolationMode = InterpolationMode.BILINEAR, antialias: bool = False, ) -> BoundingBox: - output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) - return BoundingBox.wrap_like(self, output, image_size=image_size) + output, spatial_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def pad( self, @@ -118,10 +120,10 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, padding_mode: str = "constant", ) -> BoundingBox: - output, image_size = self._F.pad_bounding_box( - self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode + output, spatial_size = self._F.pad_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, padding=padding, padding_mode=padding_mode ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return 
BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def rotate( self, @@ -131,10 +133,10 @@ class BoundingBox(_Feature): fill: FillTypeJIT = None, center: Optional[List[float]] = None, ) -> BoundingBox: - output, image_size = self._F.rotate_bounding_box( - self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center + output, spatial_size = self._F.rotate_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, angle=angle, expand=expand, center=center ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def affine( self, @@ -149,7 +151,7 @@ class BoundingBox(_Feature): output = self._F.affine_bounding_box( self, self.format, - self.image_size, + self.spatial_size, angle, translate=translate, scale=scale, diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 4b963986b..9347b4eca 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -49,12 +49,12 @@ class EncodedData(_Feature): class EncodedImage(EncodedData): # TODO: Use @functools.cached_property if we can depend on Python 3.8 @property - def image_size(self) -> Tuple[int, int]: - if not hasattr(self, "_image_size"): + def spatial_size(self) -> Tuple[int, int]: + if not hasattr(self, "_spatial_size"): with PIL.Image.open(ReadOnlyTensorBuffer(self)) as image: - self._image_size = image.height, image.width + self._spatial_size = image.height, image.width - return self._image_size + return self._spatial_size class EncodedVideo(EncodedData): diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 23f81678d..6d52a178b 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -105,7 +105,7 @@ class Image(_Feature): return self._make_repr(color_space=self.color_space) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @property diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 7b49ce8e8..2da10195e 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -33,7 +33,7 @@ class Mask(_Feature): return cls._wrap(tensor) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) def horizontal_flip(self) -> Mask: diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index e32c36d5d..ca4253c73 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -54,9 +54,8 @@ class Video(_Feature): def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr(color_space=self.color_space) - # TODO: rename this (and all instances of this term to spatial size) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @property diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 4bfb5c9ed..f0e527385 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -11,7 +11,7 @@ from torchvision.prototype import 
features from torchvision.prototype.transforms import functional as F, InterpolationMode from ._transform import _RandomApplyTransform -from ._utils import has_any, query_chw +from ._utils import has_any, query_chw, query_spatial_size class RandomErasing(_RandomApplyTransform): @@ -153,7 +153,7 @@ class RandomCutmix(_BaseMixupCutmix): def _get_params(self, sample: Any) -> Dict[str, Any]: lam = float(self._dist.sample(())) - _, H, W = query_chw(sample) + H, W = query_spatial_size(sample) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 340e721da..616669cc8 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -100,7 +100,7 @@ class RandomPhotometricDistort(Transform): self.p = p def _get_params(self, sample: Any) -> Dict[str, Any]: - num_channels, _, _ = query_chw(sample) + num_channels, *_ = query_chw(sample) return dict( zip( ["brightness", "contrast1", "saturation", "hue", "contrast2"], diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index f8aec22b9..0cc4a90c4 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -78,7 +78,7 @@ class RandomGrayscale(_RandomApplyTransform): super().__init__(p=p) def _get_params(self, sample: Any) -> Dict[str, Any]: - num_input_channels, _, _ = query_chw(sample) + num_input_channels, *_ = query_chw(sample) return dict(num_input_channels=num_input_channels) def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 371ea7f69..91d7c294e 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -24,7 +24,7 @@ from ._utils import ( has_all, has_any, query_bounding_box, - query_chw, + query_spatial_size, ) @@ -105,10 +105,7 @@ class RandomResizedCrop(Transform): self._log_ratio = torch.log(torch.tensor(self.ratio)) def _get_params(self, sample: Any) -> Dict[str, Any]: - # vfdev-5: techically, this op can work on bboxes/segm masks only inputs without image in samples - # What if we have multiple images/bboxes/masks of different sizes ? 
- # TODO: let's support bbox or mask in samples without image - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) area = height * width log_ratio = self._log_ratio @@ -263,7 +260,7 @@ class RandomZoomOut(_RandomApplyTransform): raise ValueError(f"Invalid canvas side range provided {side_range}.") def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_h, orig_w = query_chw(sample) + orig_h, orig_w = query_spatial_size(sample) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -362,10 +359,7 @@ class RandomAffine(Transform): self.center = center def _get_params(self, sample: Any) -> Dict[str, Any]: - - # Get image size - # TODO: make it work with bboxes and segm masks - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) if self.translate is not None: @@ -427,7 +421,7 @@ class RandomCrop(Transform): self.padding_mode = padding_mode def _get_params(self, sample: Any) -> Dict[str, Any]: - _, padded_height, padded_width = query_chw(sample) + padded_height, padded_width = query_spatial_size(sample) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -515,9 +509,7 @@ class RandomPerspective(_RandomApplyTransform): self.fill = _setup_fill_arg(fill) def _get_params(self, sample: Any) -> Dict[str, Any]: - # Get image size - # TODO: make it work with bboxes and segm masks - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) distortion_scale = self.distortion_scale @@ -571,9 +563,7 @@ class ElasticTransform(Transform): self.fill = _setup_fill_arg(fill) def _get_params(self, sample: Any) -> Dict[str, Any]: - # Get image size - # TODO: make it work with bboxes and segm masks - _, *size = query_chw(sample) + size = list(query_spatial_size(sample)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -628,7 +618,7 @@ class RandomIoUCrop(Transform): self.trials = trials def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_h, orig_w = query_chw(sample) + orig_h, orig_w = query_spatial_size(sample) bboxes = query_bounding_box(sample) while True: @@ -690,7 +680,7 @@ class RandomIoUCrop(Transform): if isinstance(output, features.BoundingBox): bboxes = output[is_within_crop_area] - bboxes = F.clamp_bounding_box(bboxes, output.format, output.image_size) + bboxes = F.clamp_bounding_box(bboxes, output.format, output.spatial_size) output = features.BoundingBox.wrap_like(output, bboxes) elif isinstance(output, features.Mask): # apply is_within_crop_area if mask is one-hot encoded @@ -727,7 +717,7 @@ class ScaleJitter(Transform): self.antialias = antialias def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_height, orig_width = query_chw(sample) + orig_height, orig_width = query_spatial_size(sample) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -755,7 +745,7 @@ class RandomShortestSize(Transform): self.antialias = antialias def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_height, orig_width = query_chw(sample) + orig_height, orig_width = query_spatial_size(sample) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) @@ -786,7 +776,7 @@ 
class FixedSizeCrop(Transform): self.padding_mode = padding_mode def _get_params(self, sample: Any) -> Dict[str, Any]: - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -811,7 +801,7 @@ class FixedSizeCrop(Transform): bounding_boxes = features.BoundingBox.wrap_like( bounding_boxes, F.clamp_bounding_box( - bounding_boxes, format=bounding_boxes.format, image_size=bounding_boxes.image_size + bounding_boxes, format=bounding_boxes.format, spatial_size=bounding_boxes.spatial_size ), ) height_and_width = bounding_boxes.to_format(features.BoundingBoxFormat.XYWH)[..., 2:] @@ -851,7 +841,7 @@ class FixedSizeCrop(Transform): elif isinstance(inpt, features.BoundingBox): inpt = features.BoundingBox.wrap_like( inpt, - F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, image_size=inpt.image_size), + F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size), ) if params["needs_pad"]: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index e5c7d05b0..dc109269f 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -68,5 +68,5 @@ class ClampBoundingBoxes(Transform): _transformed_types = (features.BoundingBox,) def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: - output = F.clamp_bounding_box(inpt, format=inpt.format, image_size=inpt.image_size) + output = F.clamp_bounding_box(inpt, format=inpt.format, spatial_size=inpt.spatial_size) return features.BoundingBox.wrap_like(inpt, output) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index a3980fa21..53b27f2e2 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -10,7 +10,7 @@ from torchvision._utils import sequence_to_str from torchvision.prototype import features from torchvision.prototype.features._feature import FillType -from torchvision.prototype.transforms.functional._meta import get_dimensions +from torchvision.prototype.transforms.functional._meta import get_dimensions, get_spatial_size from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size # noqa: F401 from typing_extensions import Literal @@ -98,6 +98,22 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: return c, h, w +def query_spatial_size(sample: Any) -> Tuple[int, int]: + flat_sample, _ = tree_flatten(sample) + sizes = { + tuple(get_spatial_size(item)) + for item in flat_sample + if isinstance(item, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) + or features.is_simple_tensor(item) + } + if not sizes: + raise TypeError("No image, video, mask or bounding box was found in the sample") + elif len(sizes) > 1: + raise ValueError(f"Found multiple HxW dimensions in the sample: {sequence_to_str(sorted(sizes))}") + h, w = sizes.pop() + return h, w + + def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...]) -> bool: for type_or_check in types_or_checks: if isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj): diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index 579442dc7..fb72e7b57 100644 --- 
a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -11,12 +11,18 @@ from ._meta import ( get_dimensions_image_tensor, get_dimensions_image_pil, get_dimensions, + get_num_frames_video, + get_num_frames, get_image_num_channels, get_num_channels_image_tensor, get_num_channels_image_pil, + get_num_channels_video, get_num_channels, + get_spatial_size_bounding_box, get_spatial_size_image_tensor, get_spatial_size_image_pil, + get_spatial_size_mask, + get_spatial_size_video, get_spatial_size, ) # usort: skip diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 44b4986ab..590a13310 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -32,7 +32,7 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: def horizontal_flip_bounding_box( - bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: features.BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape @@ -40,7 +40,7 @@ def horizontal_flip_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - bounding_box[:, [0, 2]] = image_size[1] - bounding_box[:, [2, 0]] + bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False @@ -69,7 +69,7 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: def vertical_flip_bounding_box( - bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: features.BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape @@ -77,7 +77,7 @@ def vertical_flip_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - bounding_box[:, [1, 3]] = image_size[0] - bounding_box[:, [3, 1]] + bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False @@ -104,11 +104,11 @@ vflip = vertical_flip def _compute_resized_output_size( - image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> List[int]: if isinstance(size, int): size = [size] - return __compute_resized_output_size(image_size, size=size, max_size=max_size) + return __compute_resized_output_size(spatial_size, size=size, max_size=max_size) def resize_image_tensor( @@ -162,10 +162,10 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N def resize_bounding_box( - bounding_box: torch.Tensor, image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + bounding_box: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> Tuple[torch.Tensor, Tuple[int, int]]: - old_height, old_width = image_size - new_height, new_width = _compute_resized_output_size(image_size, size=size, max_size=max_size) + old_height, old_width = spatial_size + new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) ratios = torch.tensor((new_width / 
old_width, new_height / old_height), device=bounding_box.device) return ( bounding_box.view(-1, 2, 2).mul(ratios).to(bounding_box.dtype).view(bounding_box.shape), @@ -312,7 +312,7 @@ def affine_image_pil( def _affine_bounding_box_xyxy( bounding_box: torch.Tensor, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -325,7 +325,7 @@ def _affine_bounding_box_xyxy( ) if center is None: - height, width = image_size + height, width = spatial_size center = [width * 0.5, height * 0.5] dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 @@ -359,7 +359,7 @@ def _affine_bounding_box_xyxy( if expand: # Compute minimum point for transformed image frame: # Points are Top-Left, Top-Right, Bottom-Left, Bottom-Right points. - height, width = image_size + height, width = spatial_size points = torch.tensor( [ [0.0, 0.0, 1.0], @@ -378,15 +378,15 @@ def _affine_bounding_box_xyxy( # Estimate meta-data for image with inverted=True and with center=[0,0] affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height) - image_size = (new_height, new_width) + spatial_size = (new_height, new_width) - return out_bboxes.to(bounding_box.dtype), image_size + return out_bboxes.to(bounding_box.dtype), spatial_size def affine_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -398,7 +398,7 @@ def affine_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) + out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, spatial_size, angle, translate, scale, shear, center) # out_bboxes should be of shape [N boxes, 4] @@ -573,7 +573,7 @@ def rotate_image_pil( def rotate_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: float, expand: bool = False, center: Optional[List[float]] = None, @@ -587,9 +587,9 @@ def rotate_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes, image_size = _affine_bounding_box_xyxy( + out_bboxes, spatial_size = _affine_bounding_box_xyxy( bounding_box, - image_size, + spatial_size, angle=-angle, translate=[0.0, 0.0], scale=1.0, @@ -602,7 +602,7 @@ def rotate_bounding_box( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False ).view(original_shape), - image_size, + spatial_size, ) @@ -756,7 +756,7 @@ def pad_mask( def pad_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], padding: Union[int, List[int]], padding_mode: str = "constant", ) -> Tuple[torch.Tensor, Tuple[int, int]]: @@ -775,7 +775,7 @@ def pad_bounding_box( bounding_box[..., 2] += left bounding_box[..., 3] += top - height, width = image_size + height, width = spatial_size height += top + bottom width += left + right @@ -1066,10 +1066,10 @@ def elastic_bounding_box( ).view(-1, 4) # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it - # Or add image_size arg and check 
displacement shape - image_size = displacement.shape[-3], displacement.shape[-2] + # Or add spatial_size arg and check displacement shape + spatial_size = displacement.shape[-3], displacement.shape[-2] - id_grid = _FT._create_identity_grid(list(image_size)).to(bounding_box.device) + id_grid = _FT._create_identity_grid(list(spatial_size)).to(bounding_box.device) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement # This is not an exact inverse of the grid inv_grid = id_grid - displacement @@ -1079,7 +1079,7 @@ def elastic_bounding_box( index_x = torch.floor(points[:, 0] + 0.5).to(dtype=torch.long) index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long) # Transform points: - t_size = torch.tensor(image_size[::-1], device=displacement.device, dtype=displacement.dtype) + t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5 transformed_points = transformed_points.view(-1, 4, 2) @@ -1199,11 +1199,11 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL def center_crop_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], output_size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: crop_height, crop_width = _center_crop_parse_output_size(output_size) - crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *image_size) + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size) return crop_bounding_box(bounding_box, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width) diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index c03d65c95..a118784eb 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -18,7 +18,7 @@ def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: return get_dimensions_image_tensor(image) elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels - height, width = image.image_size + height, width = image.spatial_size return [channels, height, width] else: return get_dimensions_image_pil(image) @@ -28,6 +28,10 @@ get_num_channels_image_tensor = _FT.get_image_num_channels get_num_channels_image_pil = _FP.get_image_num_channels +def get_num_channels_video(video: torch.Tensor) -> int: + return get_num_channels_image_tensor(video) + + def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) @@ -55,21 +59,39 @@ def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: return [height, width] -# TODO: Should we have get_spatial_size_video here? How about masks/bbox etc? What is the criterion for deciding when -# a kernel will be created? 
+def get_spatial_size_video(video: torch.Tensor) -> List[int]: + return get_spatial_size_image_tensor(video) + + +def get_spatial_size_mask(mask: torch.Tensor) -> List[int]: + return get_spatial_size_image_tensor(mask) + + +@torch.jit.unused +def get_spatial_size_bounding_box(bounding_box: features.BoundingBox) -> List[int]: + return list(bounding_box.spatial_size) def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return get_spatial_size_image_tensor(inpt) - elif isinstance(inpt, features._Feature): - image_size = getattr(inpt, "image_size", None) - if image_size is not None: - return list(image_size) - else: - raise ValueError(f"Type {inpt.__class__} doesn't have spatial size.") + elif isinstance(inpt, (features.Image, features.Video, features.BoundingBox, features.Mask)): + return list(inpt.spatial_size) + else: + return get_spatial_size_image_pil(inpt) # type: ignore[no-any-return] + + +def get_num_frames_video(video: torch.Tensor) -> int: + return video.shape[-4] + + +def get_num_frames(inpt: features.VideoTypeJIT) -> int: + if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Video)): + return get_num_frames_video(inpt) + elif isinstance(inpt, features.Video): + return inpt.num_frames else: - return get_spatial_size_image_pil(inpt) + raise TypeError(f"The video should be a Tensor. Got {type(inpt)}") def _xywh_to_xyxy(xywh: torch.Tensor) -> torch.Tensor: @@ -125,13 +147,13 @@ def convert_format_bounding_box( def clamp_bounding_box( - bounding_box: torch.Tensor, format: BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: # TODO: (PERF) Possible speed up clamping if we have different implementations for each bbox format. # Not sure if they yield equivalent results. 
xyxy_boxes = convert_format_bounding_box(bounding_box, format, BoundingBoxFormat.XYXY) - xyxy_boxes[..., 0::2].clamp_(min=0, max=image_size[1]) - xyxy_boxes[..., 1::2].clamp_(min=0, max=image_size[0]) + xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1]) + xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0]) return convert_format_bounding_box(xyxy_boxes, BoundingBoxFormat.XYXY, format, copy=False) -- GitLab From 6e72f2fda1df6704003742238f0e87732b9635a1 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 11 Oct 2022 16:59:58 +0100 Subject: [PATCH 032/624] Add seeds on Kernel Info and reduce randomness for Gaussian Blur (#6741) * Add seeds on Kernel Info and reduce randomness for Gaussian Blur * Fix linter --- test/prototype_transforms_kernel_infos.py | 9 +++++++-- test/test_prototype_transforms_functional.py | 14 +++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 239425d17..f7b1e71f3 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -49,12 +49,14 @@ class KernelInfo(InfoBase): test_marks=None, # See InfoBase closeness_kwargs=None, + seed=None, ): super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) self.kernel = kernel self.sample_inputs_fn = sample_inputs_fn self.reference_fn = reference_fn self.reference_inputs_fn = reference_inputs_fn + self.seed = seed DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -1304,7 +1306,7 @@ KERNEL_INFOS.extend( def sample_inputs_gaussian_blur_image_tensor(): make_gaussian_blur_image_loaders = functools.partial( - make_image_loaders, sizes=["random"], color_spaces=[features.ColorSpace.RGB] + make_image_loaders, sizes=[(7, 33)], color_spaces=[features.ColorSpace.RGB] ) for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): @@ -1317,7 +1319,7 @@ def sample_inputs_gaussian_blur_image_tensor(): def sample_inputs_gaussian_blur_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[(7, 33)], num_frames=[5]): yield ArgsKwargs(video_loader, kernel_size=[3, 3]) @@ -1331,10 +1333,13 @@ KERNEL_INFOS.extend( xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), ], + seed=0, ), KernelInfo( F.gaussian_blur_video, sample_inputs_fn=sample_inputs_gaussian_blur_video, + closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, + seed=0, ), ] ) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 56c473a23..c08228769 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -6,7 +6,7 @@ import PIL.Image import pytest import torch -from common_utils import cache, cpu_and_gpu, needs_cuda +from common_utils import cache, cpu_and_gpu, needs_cuda, set_rng_seed from prototype_common_utils import assert_close, make_bounding_boxes, make_image from prototype_transforms_dispatcher_infos import DISPATCHER_INFOS from prototype_transforms_kernel_infos import KERNEL_INFOS @@ -81,6 +81,8 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_scripted_vs_eager(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) kernel_eager = info.kernel kernel_scripted = script(kernel_eager) @@ -111,6 +113,8 @@ class TestKernels: 
@sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_batched_vs_single(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) (batched_input, *other_args), kwargs = args_kwargs.load(device) feature_type = features.Image if features.is_simple_tensor(batched_input) else type(batched_input) @@ -146,6 +150,8 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_no_inplace(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) if input.numel() == 0: @@ -159,6 +165,8 @@ class TestKernels: @sample_inputs @needs_cuda def test_cuda_vs_cpu(self, info, args_kwargs): + if info.seed is not None: + set_rng_seed(info.seed) (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") input_cuda = input_cpu.to("cuda") @@ -170,6 +178,8 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_dtype_and_device_consistency(self, info, args_kwargs, device): + if info.seed is not None: + set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) output = info.kernel(input, *other_args, **kwargs) @@ -182,6 +192,8 @@ class TestKernels: @reference_inputs def test_against_reference(self, info, args_kwargs): + if info.seed is not None: + set_rng_seed(info.seed) args, kwargs = args_kwargs.load("cpu") actual = info.kernel(*args, **kwargs) -- GitLab From 1b5e1b4dd4e173655c255a3c472b0a668d8c9414 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 11 Oct 2022 12:41:55 -0400 Subject: [PATCH 033/624] Fix for windows and python 3.8 call to add_dll_directory (#6742) --- torchvision/extension.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torchvision/extension.py b/torchvision/extension.py index 702e7e33b..de5ea0c94 100644 --- a/torchvision/extension.py +++ b/torchvision/extension.py @@ -21,12 +21,16 @@ try: # To find cuda related dlls we need to make sure the # conda environment/bin path is configured Please take a look: # https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python + # Please note: if some path can't be added using add_dll_directory we simply ignore this path if os.name == "nt" and sys.version_info >= (3, 8) and sys.version_info < (3, 9): env_path = os.environ["PATH"] path_arr = env_path.split(";") for path in path_arr: if os.path.exists(path): - os.add_dll_directory(path) # type: ignore[attr-defined] + try: + os.add_dll_directory(path) # type: ignore[attr-defined] + except Exception: + pass lib_path = _get_extension_path("_C") torch.ops.load_library(lib_path) -- GitLab From 9d16da222434c59fe26645c22116618625ccfed0 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 11 Oct 2022 15:04:56 -0400 Subject: [PATCH 034/624] Increase inactivity timeout for binary build jobs (#6746) * Increase inactivity timeout for binary build jobs * Fix binary build steo --- .circleci/config.yml | 17 ++++++++++++++--- .circleci/config.yml.in | 17 ++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1e4f2e319..713c1e6c4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -377,7 +377,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_wheel.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_wheel.sh - store_artifacts: path: dist - persist_to_workspace: @@ -393,7 
+398,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_conda.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_conda.sh - store_artifacts: path: /opt/conda/conda-bld/linux-64 - persist_to_workspace: @@ -411,7 +421,7 @@ jobs: - designate_upload_channel - run: name: Build conda packages - no_output_timeout: 20m + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh @@ -438,6 +448,7 @@ jobs: - designate_upload_channel - run: name: Build wheel packages + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index d93ddb0be..b421dc1a7 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -377,7 +377,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_wheel.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_wheel.sh - store_artifacts: path: dist - persist_to_workspace: @@ -393,7 +398,12 @@ jobs: steps: - checkout_merge - designate_upload_channel - - run: packaging/build_conda.sh + - run: + name: Build conda packages + no_output_timeout: 30m + command: | + set -ex + packaging/build_conda.sh - store_artifacts: path: /opt/conda/conda-bld/linux-64 - persist_to_workspace: @@ -411,7 +421,7 @@ jobs: - designate_upload_channel - run: name: Build conda packages - no_output_timeout: 20m + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh @@ -438,6 +448,7 @@ jobs: - designate_upload_channel - run: name: Build wheel packages + no_output_timeout: 30m command: | set -ex source packaging/windows/internal/vc_install_helper.sh -- GitLab From 11a2eeda8fb127a7ad72b4c98ca918b93055c1e7 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 11 Oct 2022 23:47:46 +0200 Subject: [PATCH 035/624] [proto] Small improvement for tensor equalize op (#6738) * [proto] Small improvement for tensor equalize op * Fix code formatting * Added a comment on the ops --- .../prototype/transforms/functional/_color.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index d11dd3c3b..63fa8a28c 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -183,6 +183,30 @@ def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: return autocontrast_image_pil(inpt) +def _scale_channel(img_chan: torch.Tensor) -> torch.Tensor: + # TODO: we should expect bincount to always be faster than histc, but this + # isn't always the case. Once + # https://github.com/pytorch/pytorch/issues/53194 is fixed, remove the if + # block and only use bincount. 
+ if img_chan.is_cuda: + hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255) + else: + hist = torch.bincount(img_chan.view(-1), minlength=256) + + nonzero_hist = hist[hist != 0] + step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor") + if step == 0: + return img_chan + + lut = torch.div(torch.cumsum(hist, 0) + torch.div(step, 2, rounding_mode="floor"), step, rounding_mode="floor") + # Doing inplace clamp and converting lut to uint8 improves perfs + lut.clamp_(0, 255) + lut = lut.to(torch.uint8) + lut = torch.nn.functional.pad(lut[:-1], [1, 0]) + + return lut[img_chan.to(torch.int64)] + + def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.dtype != torch.uint8: raise TypeError(f"Only torch.uint8 image tensors are supported, but found {image.dtype}") @@ -194,15 +218,9 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image elif image.ndim == 2: - return _FT._scale_channel(image) + return _scale_channel(image) else: - return torch.stack( - [ - # TODO: when merging transforms v1 and v2, we can inline this function call - _FT._equalize_single_image(single_image) - for single_image in image.view(-1, num_channels, height, width) - ] - ).view(image.shape) + return torch.stack([_scale_channel(x) for x in image.view(-1, height, width)]).view(image.shape) equalize_image_pil = _FP.equalize -- GitLab From 0bfbabc2a9841a160a66a72c1e02ca8a97e6f8ee Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 12 Oct 2022 11:41:38 +0200 Subject: [PATCH 036/624] cache traceback together with exceptions (#6748) --- test/common_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 8f07e91d1..9e919a149 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -210,7 +210,7 @@ def cache(fn): """ sentinel = object() out_cache = {} - exc_cache = {} + exc_tb_cache = {} @functools.wraps(fn) def wrapper(*args, **kwargs): @@ -220,14 +220,17 @@ def cache(fn): if out is not sentinel: return out - exc = exc_cache.get(key, sentinel) - if exc is not sentinel: - raise exc + exc_tb = exc_tb_cache.get(key, sentinel) + if exc_tb is not sentinel: + raise exc_tb[0].with_traceback(exc_tb[1]) try: out = fn(*args, **kwargs) except Exception as exc: - exc_cache[key] = exc + # We need to cache the traceback here as well. Otherwise, each re-raise will add the internal pytest + # traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding + # the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details. + exc_tb_cache[key] = exc, exc.__traceback__ raise exc out_cache[key] = out -- GitLab From 7d36d263a8356fac0bb363617b0c57c3bac6f89f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 12 Oct 2022 12:03:22 +0200 Subject: [PATCH 037/624] Seed transform tests (#6749) * Revert "Add seeds on Kernel Info and reduce randomness for Gaussian Blur (#6741)" This reverts commit 6e72f2fda1df6704003742238f0e87732b9635a1. 
* add fixture to fix the RNG seed * re-add changes to gaussian_blur_* sample input shapes Co-authored-by: Vasilis Vryniotis --- test/prototype_transforms_kernel_infos.py | 5 ----- test/test_prototype_transforms_functional.py | 18 ++++++------------ 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index f7b1e71f3..5af2f8f6a 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -49,14 +49,12 @@ class KernelInfo(InfoBase): test_marks=None, # See InfoBase closeness_kwargs=None, - seed=None, ): super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) self.kernel = kernel self.sample_inputs_fn = sample_inputs_fn self.reference_fn = reference_fn self.reference_inputs_fn = reference_inputs_fn - self.seed = seed DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( @@ -1333,13 +1331,10 @@ KERNEL_INFOS.extend( xfail_jit_python_scalar_arg("kernel_size"), xfail_jit_python_scalar_arg("sigma"), ], - seed=0, ), KernelInfo( F.gaussian_blur_video, sample_inputs_fn=sample_inputs_gaussian_blur_video, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - seed=0, ), ] ) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index c08228769..982d776bd 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -67,6 +67,12 @@ def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=No return decorator +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + class TestKernels: sample_inputs = make_info_args_kwargs_parametrization( KERNEL_INFOS, @@ -81,8 +87,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_scripted_vs_eager(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) kernel_eager = info.kernel kernel_scripted = script(kernel_eager) @@ -113,8 +117,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_batched_vs_single(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) (batched_input, *other_args), kwargs = args_kwargs.load(device) feature_type = features.Image if features.is_simple_tensor(batched_input) else type(batched_input) @@ -150,8 +152,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_no_inplace(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) if input.numel() == 0: @@ -165,8 +165,6 @@ class TestKernels: @sample_inputs @needs_cuda def test_cuda_vs_cpu(self, info, args_kwargs): - if info.seed is not None: - set_rng_seed(info.seed) (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") input_cuda = input_cpu.to("cuda") @@ -178,8 +176,6 @@ class TestKernels: @sample_inputs @pytest.mark.parametrize("device", cpu_and_gpu()) def test_dtype_and_device_consistency(self, info, args_kwargs, device): - if info.seed is not None: - set_rng_seed(info.seed) (input, *other_args), kwargs = args_kwargs.load(device) output = info.kernel(input, *other_args, **kwargs) @@ -192,8 +188,6 @@ class TestKernels: @reference_inputs def test_against_reference(self, info, args_kwargs): - if info.seed is not None: - set_rng_seed(info.seed) args, kwargs = args_kwargs.load("cpu") actual = 
info.kernel(*args, **kwargs) -- GitLab From 54a2d4e8f7a4568823532d4342f6ba13e7339dce Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Wed, 12 Oct 2022 14:44:10 +0100 Subject: [PATCH 038/624] [prototype] Video Classes Clean up (#6751) * Removing unnecessary methods/classes. * Unions instead of ImageOrVideo types * Fixing JIT issue. --- torchvision/prototype/features/__init__.py | 8 +++----- torchvision/prototype/features/_encoded.py | 4 ---- torchvision/prototype/features/_image.py | 14 +------------- torchvision/prototype/features/_video.py | 8 +------- torchvision/prototype/transforms/_augment.py | 7 ++++--- .../prototype/transforms/_auto_augment.py | 6 +++--- torchvision/prototype/transforms/_color.py | 8 +++++--- .../prototype/transforms/_deprecated.py | 8 ++++++-- torchvision/prototype/transforms/_geometry.py | 17 ++++++++--------- torchvision/prototype/transforms/_meta.py | 8 +++++--- torchvision/prototype/transforms/_misc.py | 8 ++++++-- .../transforms/functional/_augment.py | 6 ++++-- .../transforms/functional/_deprecated.py | 2 +- .../transforms/functional/_geometry.py | 19 ++++++++----------- .../prototype/transforms/functional/_meta.py | 12 ++++++------ .../prototype/transforms/functional/_misc.py | 7 +++++-- 16 files changed, 66 insertions(+), 76 deletions(-) diff --git a/torchvision/prototype/features/__init__.py b/torchvision/prototype/features/__init__.py index 944ae9bd3..8a461e1be 100644 --- a/torchvision/prototype/features/__init__.py +++ b/torchvision/prototype/features/__init__.py @@ -1,5 +1,5 @@ from ._bounding_box import BoundingBox, BoundingBoxFormat -from ._encoded import EncodedData, EncodedImage, EncodedVideo +from ._encoded import EncodedData, EncodedImage from ._feature import _Feature, FillType, FillTypeJIT, InputType, InputTypeJIT, is_simple_tensor from ._image import ( ColorSpace, @@ -14,12 +14,10 @@ from ._image import ( from ._label import Label, OneHotLabel from ._mask import Mask from ._video import ( - ImageOrVideoType, - ImageOrVideoTypeJIT, LegacyVideoType, LegacyVideoTypeJIT, - TensorImageOrVideoType, - TensorImageOrVideoTypeJIT, + TensorVideoType, + TensorVideoTypeJIT, Video, VideoType, VideoTypeJIT, diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 9347b4eca..ffa347a3e 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -55,7 +55,3 @@ class EncodedImage(EncodedData): self._spatial_size = image.height, image.width return self._spatial_size - - -class EncodedVideo(EncodedData): - pass diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 6d52a178b..e9128b94b 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -6,10 +6,8 @@ from typing import Any, cast, List, Optional, Tuple, Union import PIL.Image import torch from torchvision._utils import StrEnum -from torchvision.transforms.functional import InterpolationMode, to_pil_image -from torchvision.utils import draw_bounding_boxes, make_grid +from torchvision.transforms.functional import InterpolationMode -from ._bounding_box import BoundingBox from ._feature import _Feature, FillTypeJIT @@ -124,16 +122,6 @@ class Image(_Feature): color_space=color_space, ) - def show(self) -> None: - # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we - # promote this out of the prototype state - to_pil_image(make_grid(self.view(-1, 
*self.shape[-3:]))).show() - - def draw_bounding_box(self, bounding_box: BoundingBox, **kwargs: Any) -> Image: - # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we - # promote this out of the prototype state - return Image.wrap_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs)) - def horizontal_flip(self) -> Image: output = self._F.horizontal_flip_image_tensor(self) return Image.wrap_like(self, output) diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index ca4253c73..9dfff7f96 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -7,7 +7,7 @@ import torch from torchvision.transforms.functional import InterpolationMode from ._feature import _Feature, FillTypeJIT -from ._image import ColorSpace, ImageType, ImageTypeJIT, TensorImageType, TensorImageTypeJIT +from ._image import ColorSpace class Video(_Feature): @@ -236,9 +236,3 @@ LegacyVideoType = torch.Tensor LegacyVideoTypeJIT = torch.Tensor TensorVideoType = Union[torch.Tensor, Video] TensorVideoTypeJIT = torch.Tensor - -# TODO: decide if we should do definitions for both Images and Videos or use unions in the methods -ImageOrVideoType = Union[ImageType, VideoType] -ImageOrVideoTypeJIT = Union[ImageTypeJIT, VideoTypeJIT] -TensorImageOrVideoType = Union[TensorImageType, TensorVideoType] -TensorImageOrVideoTypeJIT = Union[TensorImageTypeJIT, TensorVideoTypeJIT] diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index f0e527385..9a4d32fc6 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -1,7 +1,7 @@ import math import numbers import warnings -from typing import Any, cast, Dict, List, Optional, Tuple +from typing import Any, cast, Dict, List, Optional, Tuple, Union import PIL.Image import torch @@ -92,14 +92,15 @@ class RandomErasing(_RandomApplyTransform): return dict(i=i, j=j, h=h, w=w, v=v) - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: if params["v"] is not None: inpt = F.erase(inpt, **params, inplace=self.inplace) return inpt -# TODO: Add support for Video: https://github.com/pytorch/vision/issues/6731 class _BaseMixupCutmix(_RandomApplyTransform): def __init__(self, alpha: float, p: float = 0.5) -> None: super().__init__(p=p) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index b35b5529b..02c1a18da 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -35,7 +35,7 @@ class _AutoAugmentBase(Transform): self, sample: Any, unsupported_types: Tuple[Type, ...] 
= (features.BoundingBox, features.Mask), - ) -> Tuple[int, features.ImageOrVideoType]: + ) -> Tuple[int, Union[features.ImageType, features.VideoType]]: sample_flat, _ = tree_flatten(sample) image_or_videos = [] for id, inpt in enumerate(sample_flat): @@ -60,12 +60,12 @@ class _AutoAugmentBase(Transform): def _apply_image_or_video_transform( self, - image: features.ImageOrVideoType, + image: Union[features.ImageType, features.VideoType], transform_id: str, magnitude: float, interpolation: InterpolationMode, fill: Dict[Type, features.FillType], - ) -> features.ImageOrVideoType: + ) -> Union[features.ImageType, features.VideoType]: fill_ = fill[type(image)] fill_ = F._geometry._convert_fill_arg(fill_) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 616669cc8..609f03bf4 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -111,8 +111,8 @@ class RandomPhotometricDistort(Transform): ) def _permute_channels( - self, inpt: features.ImageOrVideoType, permutation: torch.Tensor - ) -> features.ImageOrVideoType: + self, inpt: Union[features.ImageType, features.VideoType], permutation: torch.Tensor + ) -> Union[features.ImageType, features.VideoType]: if isinstance(inpt, PIL.Image.Image): inpt = F.pil_to_tensor(inpt) @@ -126,7 +126,9 @@ class RandomPhotometricDistort(Transform): return output - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: if params["brightness"]: inpt = F.adjust_brightness( inpt, brightness_factor=ColorJitter._generate_value(self.brightness[0], self.brightness[1]) diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index 0cc4a90c4..e401534f4 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -52,7 +52,9 @@ class Grayscale(Transform): super().__init__() self.num_output_channels = num_output_channels - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: output = _F.rgb_to_grayscale(inpt, num_output_channels=self.num_output_channels) if isinstance(inpt, (features.Image, features.Video)): output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] @@ -81,7 +83,9 @@ class RandomGrayscale(_RandomApplyTransform): num_input_channels, *_ = query_chw(sample) return dict(num_input_channels=num_input_channels) - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: output = _F.rgb_to_grayscale(inpt, num_output_channels=params["num_input_channels"]) if isinstance(inpt, (features.Image, features.Video)): output = inpt.wrap_like(inpt, output, color_space=features.ColorSpace.GRAY) # type: ignore[arg-type] diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 91d7c294e..b09533273 100644 --- a/torchvision/prototype/transforms/_geometry.py 
+++ b/torchvision/prototype/transforms/_geometry.py @@ -148,6 +148,9 @@ class RandomResizedCrop(Transform): ) +ImageOrVideoTypeJIT = Union[features.ImageTypeJIT, features.VideoTypeJIT] + + class FiveCrop(Transform): """ Example: @@ -177,14 +180,8 @@ class FiveCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") def _transform( - self, inpt: features.ImageOrVideoType, params: Dict[str, Any] - ) -> Tuple[ - features.ImageOrVideoType, - features.ImageOrVideoType, - features.ImageOrVideoType, - features.ImageOrVideoType, - features.ImageOrVideoType, - ]: + self, inpt: ImageOrVideoTypeJIT, params: Dict[str, Any] + ) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: return F.five_crop(inpt, self.size) def forward(self, *inputs: Any) -> Any: @@ -205,7 +202,9 @@ class TenCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> List[features.ImageOrVideoType]: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[List[features.ImageTypeJIT], List[features.VideoTypeJIT]]: return F.ten_crop(inpt, self.size, vertical_flip=self.vertical_flip) def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index dc109269f..bdfe8b47a 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -29,8 +29,8 @@ class ConvertImageDtype(Transform): self.dtype = dtype def _transform( - self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any] - ) -> features.TensorImageOrVideoType: + self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] + ) -> Union[features.TensorImageType, features.TensorVideoType]: output = F.convert_image_dtype(inpt, dtype=self.dtype) return ( output if features.is_simple_tensor(inpt) else type(inpt).wrap_like(inpt, output) # type: ignore[attr-defined] @@ -58,7 +58,9 @@ class ConvertColorSpace(Transform): self.copy = copy - def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: + def _transform( + self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] + ) -> Union[features.ImageType, features.VideoType]: return F.convert_color_space( inpt, color_space=self.color_space, old_color_space=self.old_color_space, copy=self.copy ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index d3c8a57dc..945aa8456 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -68,7 +68,9 @@ class LinearTransformation(Transform): return super().forward(*inputs) - def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: + def _transform( + self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] + ) -> torch.Tensor: # Image instance after linear transformation is not Image anymore due to unknown data range # Thus we will return Tensor for input Image @@ -101,7 +103,9 @@ class Normalize(Transform): self.std = list(std) self.inplace = inplace - def _transform(self, inpt: features.TensorImageOrVideoType, params: Dict[str, Any]) -> torch.Tensor: 
+ def _transform( + self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] + ) -> torch.Tensor: return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) def forward(self, *inpts: Any) -> Any: diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 57c3602cc..20e5ac916 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -1,3 +1,5 @@ +from typing import Union + import PIL.Image import torch @@ -24,14 +26,14 @@ def erase_video( def erase( - inpt: features.ImageOrVideoTypeJIT, + inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT], i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False, -) -> features.ImageOrVideoTypeJIT: +) -> Union[features.ImageTypeJIT, features.VideoTypeJIT]: if isinstance(inpt, torch.Tensor): output = erase_image_tensor(inpt, i=i, j=j, h=h, w=w, v=v, inplace=inplace) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): diff --git a/torchvision/prototype/transforms/functional/_deprecated.py b/torchvision/prototype/transforms/functional/_deprecated.py index 854920b96..e18c267e8 100644 --- a/torchvision/prototype/transforms/functional/_deprecated.py +++ b/torchvision/prototype/transforms/functional/_deprecated.py @@ -59,7 +59,7 @@ def to_tensor(inpt: Any) -> torch.Tensor: return _F.to_tensor(inpt) -def get_image_size(inpt: features.ImageOrVideoTypeJIT) -> List[int]: +def get_image_size(inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT]) -> List[int]: warnings.warn( "The function `get_image_size(...)` is deprecated and will be removed in a future release. " "Instead, please use `get_spatial_size(...)` which returns `[h, w]` instead of `[w, h]`." 
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 590a13310..43962ad4d 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1382,16 +1382,13 @@ def five_crop_video( return five_crop_image_tensor(video, size) +ImageOrVideoTypeJIT = Union[features.ImageTypeJIT, features.VideoTypeJIT] + + def five_crop( - inpt: features.ImageOrVideoTypeJIT, size: List[int] -) -> Tuple[ - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, - features.ImageOrVideoTypeJIT, -]: - # TODO: consider breaking BC here to return List[features.ImageOrVideoTypeJIT] to align this op with `ten_crop` + inpt: ImageOrVideoTypeJIT, size: List[int] +) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: + # TODO: consider breaking BC here to return List[features.ImageTypeJIT/VideoTypeJIT] to align this op with `ten_crop` if isinstance(inpt, torch.Tensor): output = five_crop_image_tensor(inpt, size) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): @@ -1434,8 +1431,8 @@ def ten_crop_video(video: torch.Tensor, size: List[int], vertical_flip: bool = F def ten_crop( - inpt: features.ImageOrVideoTypeJIT, size: List[int], vertical_flip: bool = False -) -> List[features.ImageOrVideoTypeJIT]: + inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT], size: List[int], vertical_flip: bool = False +) -> Union[List[features.ImageTypeJIT], List[features.VideoTypeJIT]]: if isinstance(inpt, torch.Tensor): output = ten_crop_image_tensor(inpt, size, vertical_flip=vertical_flip) if not torch.jit.is_scripting() and isinstance(inpt, (features.Image, features.Video)): diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index a118784eb..2903d73ce 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -1,4 +1,4 @@ -from typing import cast, List, Optional, Tuple +from typing import List, Optional, Tuple, Union import PIL.Image import torch @@ -11,7 +11,7 @@ get_dimensions_image_tensor = _FT.get_dimensions get_dimensions_image_pil = _FP.get_dimensions -def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: +def get_dimensions(image: Union[features.ImageTypeJIT, features.VideoTypeJIT]) -> List[int]: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) ): @@ -32,7 +32,7 @@ def get_num_channels_video(video: torch.Tensor) -> int: return get_num_channels_image_tensor(video) -def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: +def get_num_channels(image: Union[features.ImageTypeJIT, features.VideoTypeJIT]) -> int: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) ): @@ -262,11 +262,11 @@ def convert_color_space_video( def convert_color_space( - inpt: features.ImageOrVideoTypeJIT, + inpt: Union[features.ImageTypeJIT, features.VideoTypeJIT], color_space: ColorSpace, old_color_space: Optional[ColorSpace] = None, copy: bool = True, -) -> features.ImageOrVideoTypeJIT: +) -> Union[features.ImageTypeJIT, features.VideoTypeJIT]: if isinstance(inpt, torch.Tensor) and ( torch.jit.is_scripting() or not 
isinstance(inpt, (features.Image, features.Video)) ): @@ -281,4 +281,4 @@ def convert_color_space( elif isinstance(inpt, (features.Image, features.Video)): return inpt.to_color_space(color_space, copy=copy) else: - return cast(features.ImageOrVideoTypeJIT, convert_color_space_image_pil(inpt, color_space, copy=copy)) + return convert_color_space_image_pil(inpt, color_space, copy=copy) diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 79a358b4e..8fda24e17 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Union import PIL.Image import torch @@ -14,7 +14,10 @@ def normalize_video(video: torch.Tensor, mean: List[float], std: List[float], in def normalize( - inpt: features.TensorImageOrVideoTypeJIT, mean: List[float], std: List[float], inplace: bool = False + inpt: Union[features.TensorImageTypeJIT, features.TensorVideoTypeJIT], + mean: List[float], + std: List[float], + inplace: bool = False, ) -> torch.Tensor: if torch.jit.is_scripting(): correct_type = isinstance(inpt, torch.Tensor) -- GitLab From b16dec19a4b737b3fb120c48c7da4b07456902fa Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 13 Oct 2022 13:29:45 +0200 Subject: [PATCH 039/624] [proto] Performance improvements for equalize op (#6757) * [proto] Performance improvements for equalize op * Added tests --- test/test_prototype_transforms_functional.py | 11 ++++ .../prototype/transforms/functional/_color.py | 59 +++++++++++-------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 982d776bd..34291611d 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1037,3 +1037,14 @@ def test_to_image_pil(inpt, mode): assert isinstance(output, PIL.Image.Image) assert np.asarray(inpt).sum() == np.asarray(output).sum() + + +def test_equalize_image_tensor_edge_cases(): + inpt = torch.zeros(3, 200, 200, dtype=torch.uint8) + output = F.equalize_image_tensor(inpt) + torch.testing.assert_close(inpt, output) + + inpt = torch.zeros(5, 3, 200, 200, dtype=torch.uint8) + inpt[..., 100:, 100:] = 1 + output = F.equalize_image_tensor(inpt) + assert output.unique().tolist() == [0, 255] diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 63fa8a28c..7cbf8885c 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -183,28 +183,37 @@ def autocontrast(inpt: features.InputTypeJIT) -> features.InputTypeJIT: return autocontrast_image_pil(inpt) -def _scale_channel(img_chan: torch.Tensor) -> torch.Tensor: - # TODO: we should expect bincount to always be faster than histc, but this - # isn't always the case. Once - # https://github.com/pytorch/pytorch/issues/53194 is fixed, remove the if - # block and only use bincount. 
- if img_chan.is_cuda: - hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255) - else: - hist = torch.bincount(img_chan.view(-1), minlength=256) - - nonzero_hist = hist[hist != 0] - step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor") - if step == 0: - return img_chan - - lut = torch.div(torch.cumsum(hist, 0) + torch.div(step, 2, rounding_mode="floor"), step, rounding_mode="floor") - # Doing inplace clamp and converting lut to uint8 improves perfs - lut.clamp_(0, 255) - lut = lut.to(torch.uint8) - lut = torch.nn.functional.pad(lut[:-1], [1, 0]) - - return lut[img_chan.to(torch.int64)] +def _equalize_image_tensor_vec(img: torch.Tensor) -> torch.Tensor: + # input img shape should be [N, H, W] + shape = img.shape + # Compute image histogram: + flat_img = img.flatten(start_dim=1).to(torch.long) # -> [N, H * W] + hist = flat_img.new_zeros(shape[0], 256) + hist.scatter_add_(dim=1, index=flat_img, src=flat_img.new_ones(1).expand_as(flat_img)) + + # Compute image cdf + chist = hist.cumsum_(dim=1) + # Compute steps, where step per channel is nonzero_hist[:-1].sum() // 255 + # Trick: nonzero_hist[:-1].sum() == chist[idx - 1], where idx = chist.argmax() + idx = chist.argmax(dim=1).sub_(1) + # If histogram is degenerate (hist of zero image), index is -1 + neg_idx_mask = idx < 0 + idx.clamp_(min=0) + step = chist.gather(dim=1, index=idx.unsqueeze(1)) + step[neg_idx_mask] = 0 + step.div_(255, rounding_mode="floor") + + # Compute batched Look-up-table: + # Necessary to avoid an integer division by zero, which raises + clamped_step = step.clamp(min=1) + chist.add_(torch.div(step, 2, rounding_mode="floor")).div_(clamped_step, rounding_mode="floor").clamp_(0, 255) + lut = chist.to(torch.uint8) # [N, 256] + + # Pad lut with zeros + zeros = lut.new_zeros((1, 1)).expand(shape[0], 1) + lut = torch.cat([zeros, lut[:, :-1]], dim=1) + + return torch.where((step == 0).unsqueeze(-1), img, lut.gather(dim=1, index=flat_img).view_as(img)) def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: @@ -217,10 +226,8 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image - elif image.ndim == 2: - return _scale_channel(image) - else: - return torch.stack([_scale_channel(x) for x in image.view(-1, height, width)]).view(image.shape) + + return _equalize_image_tensor_vec(image.view(-1, height, width)).view(image.shape) equalize_image_pil = _FP.equalize -- GitLab From 6d774c6fe53f2492e782095bacd544eebfeb0fc5 Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 13 Oct 2022 13:31:26 +0200 Subject: [PATCH 040/624] Fixed repr for ElasticTransform (#6758) Co-authored-by: Vasilis Vryniotis --- torchvision/transforms/transforms.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 18e2ffc96..6011e2372 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -2133,9 +2133,9 @@ class ElasticTransform(torch.nn.Module): return F.elastic_transform(tensor, displacement, self.interpolation, self.fill) def __repr__(self): - format_string = self.__class__.__name__ + "(alpha=" - format_string += str(self.alpha) + ")" - format_string += ", (sigma=" + str(self.sigma) + ")" - format_string += ", interpolation={self.interpolation}" - format_string += ", fill={self.fill})" + format_string = self.__class__.__name__ + format_string += f"(alpha={self.alpha}" + format_string += f", sigma={self.sigma}" + format_string += f", 
interpolation={self.interpolation}" + format_string += f", fill={self.fill})" return format_string -- GitLab From 3eafe77a51ba1aab061b2564f9cd8774a0df3be7 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 13:45:31 +0200 Subject: [PATCH 041/624] expand ToDtype to support multiple conversions at once (#6756) * expand ToDtype to support multiple conversions at once * simplify --- test/test_prototype_transforms.py | 38 +++++++++++++++++++++++ torchvision/prototype/transforms/_misc.py | 21 +++++++++---- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 2c095fa6e..f18597a24 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1789,3 +1789,41 @@ class TestRandomResize: mock_resize.assert_called_with( inpt_sentinel, size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel ) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtypes"), + [ + ( + torch.float64, + {torch.Tensor: torch.float64, features.Image: torch.float64, features.BoundingBox: torch.float64}, + ), + ( + {torch.Tensor: torch.int32, features.Image: torch.float32, features.BoundingBox: torch.float64}, + {torch.Tensor: torch.int32, features.Image: torch.float32, features.BoundingBox: torch.float64}, + ), + ], +) +def test_to_dtype(dtype, expected_dtypes): + sample = dict( + plain_tensor=torch.testing.make_tensor(5, dtype=torch.int64, device="cpu"), + image=make_image(dtype=torch.uint8), + bounding_box=make_bounding_box(format=features.BoundingBoxFormat.XYXY, dtype=torch.float32), + str="str", + int=0, + ) + + transform = transforms.ToDtype(dtype) + transformed_sample = transform(sample) + + for key, value in sample.items(): + value_type = type(value) + transformed_value = transformed_sample[key] + + # make sure the transformation retains the type + assert isinstance(transformed_value, value_type) + + if isinstance(value, torch.Tensor): + assert transformed_value.dtype is expected_dtypes[value_type] + else: + assert transformed_value is value diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 945aa8456..eac65da6e 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -1,4 +1,5 @@ import functools +from collections import defaultdict from typing import Any, Callable, Dict, Sequence, Type, Union import PIL.Image @@ -144,14 +145,22 @@ class GaussianBlur(Transform): return F.gaussian_blur(inpt, self.kernel_size, **params) -# TODO: Enhance as described at https://github.com/pytorch/vision/issues/6697 -class ToDtype(Lambda): - def __init__(self, dtype: torch.dtype, *types: Type) -> None: +class ToDtype(Transform): + _transformed_types = (torch.Tensor,) + + def _default_dtype(self, dtype: torch.dtype) -> torch.dtype: + return dtype + + def __init__(self, dtype: Union[torch.dtype, Dict[Type, torch.dtype]]) -> None: + super().__init__() + if not isinstance(dtype, dict): + # This weird looking construct only exists, since `lambda`'s cannot be serialized by pickle. 
+ # If it were possible, we could replace this with `defaultdict(lambda: dtype)` + dtype = defaultdict(functools.partial(self._default_dtype, dtype)) self.dtype = dtype - super().__init__(functools.partial(torch.Tensor.to, dtype=dtype), *types or (torch.Tensor,)) - def extra_repr(self) -> str: - return ", ".join([f"dtype={self.dtype}", f"types={[type.__name__ for type in self.types]}"]) + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return inpt.to(self.dtype[type(inpt)]) class RemoveSmallBoundingBoxes(Transform): -- GitLab From bdc55567d0e0f639b1c7b1dc4374819bd1b9693f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 13:59:04 +0200 Subject: [PATCH 042/624] introduce nearest-exact interpolation (#6754) * introduce nearest-exact interpolation * update prototype tests * update stable tests --- test/prototype_transforms_kernel_infos.py | 2 ++ test/test_functional_tensor.py | 11 ++++++++--- test/test_transforms_tensor.py | 15 ++++++++++----- torchvision/transforms/functional.py | 11 ++++++++--- torchvision/transforms/transforms.py | 8 ++++---- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 5af2f8f6a..c455caa6b 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -232,6 +232,7 @@ def reference_inputs_resize_image_tensor(): make_image_loaders(extra_dims=[()]), [ F.InterpolationMode.NEAREST, + F.InterpolationMode.NEAREST_EXACT, F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC, ], @@ -881,6 +882,7 @@ def reference_inputs_resized_crop_image_tensor(): make_image_loaders(extra_dims=[()]), [ F.InterpolationMode.NEAREST, + F.InterpolationMode.NEAREST_EXACT, F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC, ], diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 9bdd4ab83..25f4e709f 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -25,7 +25,12 @@ from common_utils import ( ) from torchvision.transforms import InterpolationMode -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) @pytest.mark.parametrize("device", cpu_and_gpu()) @@ -506,7 +511,7 @@ def test_perspective_interpolation_warning(): ], ) @pytest.mark.parametrize("max_size", [None, 34, 40, 1000]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) +@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize(device, dt, size, max_size, interpolation): if dt == torch.float16 and device == "cpu": @@ -966,7 +971,7 @@ def test_pad(device, dt, pad, config): @pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("mode", [NEAREST, BILINEAR, BICUBIC]) +@pytest.mark.parametrize("mode", [NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC]) def test_resized_crop(device, mode): # test values of F.resized_crop in several cases: # 1) resize to the same size, crop to the same size => should be identity diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index f4ca544de..7b75a4436 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -20,7 +20,12 @@ from torchvision import transforms as T from torchvision.transforms import 
functional as F, InterpolationMode from torchvision.transforms.autoaugment import _apply_op -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) def _test_transform_vs_scripted(transform, s_transform, tensor, msg=None): @@ -378,7 +383,7 @@ class TestResize: @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64]) @pytest.mark.parametrize("size", [[32], [32, 32], (32, 32), [34, 35]]) @pytest.mark.parametrize("max_size", [None, 35, 1000]) - @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) + @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize_scripted(self, dt, size, max_size, interpolation, device): tensor, _ = _create_data(height=34, width=36, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -402,12 +407,12 @@ class TestResize: @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) @pytest.mark.parametrize("ratio", [(0.75, 1.333), [0.75, 1.333]]) @pytest.mark.parametrize("size", [(32,), [44], [32], [32, 32], (32, 32), [44, 55]]) - @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) + @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC, NEAREST_EXACT]) @pytest.mark.parametrize("antialias", [None, True, False]) def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device): - if antialias and interpolation == NEAREST: - pytest.skip("Can not resize if interpolation mode is NEAREST and antialias=True") + if antialias and interpolation in {NEAREST, NEAREST_EXACT}: + pytest.skip(f"Can not resize if interpolation mode is {interpolation} and antialias=True") tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index e82c5eca8..f06b5dbc9 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -20,10 +20,12 @@ from . import functional_pil as F_pil, functional_tensor as F_t class InterpolationMode(Enum): """Interpolation modes - Available interpolation methods are ``nearest``, ``bilinear``, ``bicubic``, ``box``, ``hamming``, and ``lanczos``. + Available interpolation methods are ``nearest``, ``nearest-exact``, ``bilinear``, ``bicubic``, ``box``, ``hamming``, + and ``lanczos``. """ NEAREST = "nearest" + NEAREST_EXACT = "nearest-exact" BILINEAR = "bilinear" BICUBIC = "bicubic" # For PIL compatibility @@ -50,6 +52,7 @@ pil_modes_mapping = { InterpolationMode.NEAREST: 0, InterpolationMode.BILINEAR: 2, InterpolationMode.BICUBIC: 3, + InterpolationMode.NEAREST_EXACT: 0, InterpolationMode.BOX: 4, InterpolationMode.HAMMING: 5, InterpolationMode.LANCZOS: 1, @@ -416,7 +419,8 @@ def resize( interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, - ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are + supported. 
For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. max_size (int, optional): The maximum allowed for the longer edge of @@ -617,7 +621,8 @@ def resized_crop( interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, - ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are + supported. For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. antialias (bool, optional): antialias flag. If ``img`` is PIL Image, the flag is ignored and anti-alias diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 6011e2372..985937678 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -296,8 +296,8 @@ class Resize(torch.nn.Module): In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. - If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` and - ``InterpolationMode.BICUBIC`` are supported. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. max_size (int, optional): The maximum allowed for the longer edge of @@ -865,8 +865,8 @@ class RandomResizedCrop(torch.nn.Module): resizing. interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. - If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` and - ``InterpolationMode.BICUBIC`` are supported. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. antialias (bool, optional): antialias flag. 
If ``img`` is PIL Image, the flag is ignored and anti-alias -- GitLab From e1b21f9c20e70ee5385ecd6ea2268010b8c4aed1 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 16:46:12 +0200 Subject: [PATCH 043/624] introduce _check method for type checks on prototype transforms (#6503) * introduce _check method for type checks on prototype transforms * cleanup * Update torchvision/prototype/transforms/_geometry.py Co-authored-by: Vasilis Vryniotis * introduce _check on new transforms * _check -> _check_inputs * always check inputs in _RandomApplyTransform Co-authored-by: Vasilis Vryniotis --- torchvision/prototype/transforms/_augment.py | 9 ++- torchvision/prototype/transforms/_geometry.py | 63 +++++++++---------- torchvision/prototype/transforms/_misc.py | 15 ++--- .../prototype/transforms/_transform.py | 20 +++++- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 9a4d32fc6..5861dd291 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -107,17 +107,16 @@ class _BaseMixupCutmix(_RandomApplyTransform): self.alpha = alpha self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def forward(self, *inputs: Any) -> Any: + def _check_inputs(self, sample: Any) -> None: if not ( - has_any(inputs, features.Image, features.Video, features.is_simple_tensor) - and has_any(inputs, features.OneHotLabel) + has_any(sample, features.Image, features.Video, features.is_simple_tensor) + and has_any(sample, features.OneHotLabel) ): raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") - if has_any(inputs, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): + if has_any(sample, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." 
) - return super().forward(*inputs) def _mixup_onehotlabel(self, inpt: features.OneHotLabel, lam: float) -> features.OneHotLabel: if inpt.ndim < 2: diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index b09533273..5b31adc9e 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -184,10 +184,9 @@ class FiveCrop(Transform): ) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: return F.five_crop(inpt, self.size) - def forward(self, *inputs: Any) -> Any: - if has_any(inputs, features.BoundingBox, features.Mask): + def _check_inputs(self, sample: Any) -> None: + if has_any(sample, features.BoundingBox, features.Mask): raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") - return super().forward(*inputs) class TenCrop(Transform): @@ -202,16 +201,15 @@ class TenCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip + def _check_inputs(self, sample: Any) -> None: + if has_any(sample, features.BoundingBox, features.Mask): + raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") + def _transform( self, inpt: Union[features.ImageType, features.VideoType], params: Dict[str, Any] ) -> Union[List[features.ImageTypeJIT], List[features.VideoTypeJIT]]: return F.ten_crop(inpt, self.size, vertical_flip=self.vertical_flip) - def forward(self, *inputs: Any) -> Any: - if has_any(inputs, features.BoundingBox, features.Mask): - raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") - return super().forward(*inputs) - class Pad(Transform): def __init__( @@ -616,6 +614,17 @@ class RandomIoUCrop(Transform): self.options = sampler_options self.trials = trials + def _check_inputs(self, sample: Any) -> None: + if not ( + has_all(sample, features.BoundingBox) + and has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor) + and has_any(sample, features.Label, features.OneHotLabel) + ): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain Images or PIL Images, " + "BoundingBoxes and Labels or OneHotLabels. Sample can also contain Masks." + ) + def _get_params(self, sample: Any) -> Dict[str, Any]: orig_h, orig_w = query_spatial_size(sample) bboxes = query_bounding_box(sample) @@ -688,18 +697,6 @@ class RandomIoUCrop(Transform): return output - def forward(self, *inputs: Any) -> Any: - if not ( - has_all(inputs, features.BoundingBox) - and has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor) - and has_any(inputs, features.Label, features.OneHotLabel) - ): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain Images or PIL Images, " - "BoundingBoxes and Labels or OneHotLabels. Sample can also contain Masks." - ) - return super().forward(*inputs) - class ScaleJitter(Transform): def __init__( @@ -774,6 +771,18 @@ class FixedSizeCrop(Transform): self.padding_mode = padding_mode + def _check_inputs(self, sample: Any) -> None: + if not has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." 
+ ) + + if has_any(sample, features.BoundingBox) and not has_any(sample, features.Label, features.OneHotLabel): + raise TypeError( + f"If a BoundingBox is contained in the input sample, " + f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." + ) + def _get_params(self, sample: Any) -> Dict[str, Any]: height, width = query_spatial_size(sample) new_height = min(height, self.crop_height) @@ -850,20 +859,6 @@ class FixedSizeCrop(Transform): return inpt - def forward(self, *inputs: Any) -> Any: - if not has_any(inputs, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." - ) - - if has_any(inputs, features.BoundingBox) and not has_any(inputs, features.Label, features.OneHotLabel): - raise TypeError( - f"If a BoundingBox is contained in the input sample, " - f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." - ) - - return super().forward(*inputs) - class RandomResize(Transform): def __init__( diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index eac65da6e..61be60cee 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -63,12 +63,10 @@ class LinearTransformation(Transform): self.transformation_matrix = transformation_matrix self.mean_vector = mean_vector - def forward(self, *inputs: Any) -> Any: - if has_any(inputs, PIL.Image.Image): + def _check_inputs(self, sample: Any) -> Any: + if has_any(sample, PIL.Image.Image): raise TypeError("LinearTransformation does not work on PIL Images") - return super().forward(*inputs) - def _transform( self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] ) -> torch.Tensor: @@ -104,16 +102,15 @@ class Normalize(Transform): self.std = list(std) self.inplace = inplace + def _check_inputs(self, sample: Any) -> Any: + if has_any(sample, PIL.Image.Image): + raise TypeError(f"{type(self).__name__}() does not support PIL images.") + def _transform( self, inpt: Union[features.TensorImageType, features.TensorVideoType], params: Dict[str, Any] ) -> torch.Tensor: return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) - def forward(self, *inpts: Any) -> Any: - if has_any(inpts, PIL.Image.Image): - raise TypeError(f"{type(self).__name__}() does not support PIL images.") - return super().forward(*inpts) - class GaussianBlur(Transform): def __init__( diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 54ae91b79..056c2da9f 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -23,6 +23,9 @@ class Transform(nn.Module): super().__init__() _log_api_usage_once(self) + def _check_inputs(self, sample: Any) -> None: + pass + def _get_params(self, sample: Any) -> Dict[str, Any]: return dict() @@ -32,6 +35,8 @@ class Transform(nn.Module): def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] + self._check_inputs(sample) + params = self._get_params(sample) flat_inputs, spec = tree_flatten(sample) @@ -64,9 +69,22 @@ class _RandomApplyTransform(Transform): self.p = p def forward(self, *inputs: Any) -> Any: + # We need to almost duplicate `Transform.forward()` here since we always want to check the inputs, but return + # early afterwards in case the random check triggers. 
The same result could be achieved by calling + # `super().forward()` after the random check, but that would call `self._check_inputs` twice. + sample = inputs if len(inputs) > 1 else inputs[0] + self._check_inputs(sample) + if torch.rand(1) >= self.p: return sample - return super().forward(sample) + params = self._get_params(sample) + + flat_inputs, spec = tree_flatten(sample) + flat_outputs = [ + self._transform(inpt, params) if _isinstance(inpt, self._transformed_types) else inpt + for inpt in flat_inputs + ] + return tree_unflatten(flat_outputs, spec) -- GitLab From dc5fd831ed9f4a0c58a194853ffa9cce6c240026 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 13 Oct 2022 17:13:45 +0200 Subject: [PATCH 044/624] improve test id for consistency tests (#6763) --- test/test_prototype_transforms_consistency.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index f335220fb..589d45595 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -575,9 +575,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s @pytest.mark.parametrize( ("config", "args_kwargs"), [ - pytest.param(config, args_kwargs, id=f"{config.legacy_cls.__name__}({args_kwargs})") + pytest.param( + config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" + ) for config in CONSISTENCY_CONFIGS - for args_kwargs in config.args_kwargs + for idx, args_kwargs in enumerate(config.args_kwargs) ], ) def test_call_consistency(config, args_kwargs): -- GitLab From e3238e5af74e2c1af594ab4bae8cd6bfbf5bce2c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 14 Oct 2022 10:01:40 +0200 Subject: [PATCH 045/624] only flatten a pytree once (#6767) --- test/test_prototype_transforms.py | 45 ++++++------ test/test_prototype_transforms_consistency.py | 2 +- torchvision/prototype/transforms/_augment.py | 26 +++---- .../prototype/transforms/_auto_augment.py | 54 +++++++------- torchvision/prototype/transforms/_color.py | 8 +-- .../prototype/transforms/_deprecated.py | 6 +- torchvision/prototype/transforms/_geometry.py | 72 ++++++++++--------- torchvision/prototype/transforms/_misc.py | 8 +-- .../prototype/transforms/_transform.py | 25 +++---- torchvision/prototype/transforms/_utils.py | 42 +++++------ 10 files changed, 143 insertions(+), 145 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index f18597a24..11a51f7b5 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -437,7 +437,7 @@ class TestRandomZoomOut: image = mocker.MagicMock(spec=features.Image) h, w = image.spatial_size = (24, 32) - params = transform._get_params(image) + params = transform._get_params([image]) assert len(params["padding"]) == 4 assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w @@ -462,7 +462,7 @@ class TestRandomZoomOut: _ = transform(inpt) torch.manual_seed(12) torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, fill=fill) @@ -623,7 +623,7 @@ class TestRandomAffine: h, w = image.spatial_size transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) - params = transform._get_params(image) + params = 
transform._get_params([image]) if not isinstance(degrees, (list, tuple)): assert -degrees <= params["angle"] <= degrees @@ -690,7 +690,7 @@ class TestRandomAffine: torch.manual_seed(12) _ = transform(inpt) torch.manual_seed(12) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center) @@ -722,7 +722,7 @@ class TestRandomCrop: h, w = image.spatial_size transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) - params = transform._get_params(image) + params = transform._get_params([image]) if padding is not None: if isinstance(padding, int): @@ -793,7 +793,7 @@ class TestRandomCrop: torch.manual_seed(12) _ = transform(inpt) torch.manual_seed(12) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) if padding is None and not pad_if_needed: fn_crop.assert_called_once_with( inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] @@ -832,7 +832,7 @@ class TestGaussianBlur: @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]]) def test__get_params(self, sigma): transform = transforms.GaussianBlur(3, sigma=sigma) - params = transform._get_params(None) + params = transform._get_params([]) if isinstance(sigma, float): assert params["sigma"][0] == params["sigma"][1] == 10 @@ -867,7 +867,7 @@ class TestGaussianBlur: torch.manual_seed(12) _ = transform(inpt) torch.manual_seed(12) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fn.assert_called_once_with(inpt, kernel_size, **params) @@ -912,7 +912,7 @@ class TestRandomPerspective: image.num_channels = 3 image.spatial_size = (24, 32) - params = transform._get_params(image) + params = transform._get_params([image]) h, w = image.spatial_size assert "perspective_coeffs" in params @@ -935,7 +935,7 @@ class TestRandomPerspective: _ = transform(inpt) torch.manual_seed(12) torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) @@ -973,7 +973,7 @@ class TestElasticTransform: image.num_channels = 3 image.spatial_size = (24, 32) - params = transform._get_params(image) + params = transform._get_params([image]) h, w = image.spatial_size displacement = params["displacement"] @@ -1006,7 +1006,7 @@ class TestElasticTransform: # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() _ = transform(inpt) - params = transform._get_params(inpt) + params = transform._get_params([inpt]) fill = transforms.functional._geometry._convert_fill_arg(fill) fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) @@ -1035,7 +1035,7 @@ class TestRandomErasing: transform = transforms.RandomErasing(value=[1, 2, 3, 4]) with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): - transform._get_params(image) + transform._get_params([image]) @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) def test__get_params(self, value, mocker): @@ -1044,7 +1044,7 @@ class TestRandomErasing: image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=value) - params = transform._get_params(image) + params = transform._get_params([image]) v = 
params["v"] h, w = params["h"], params["w"] @@ -1197,6 +1197,7 @@ class TestContainers: [ [transforms.Pad(2), transforms.RandomCrop(28)], [lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)], + [transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)], ], ) def test_ctor(self, transform_cls, trfms): @@ -1339,7 +1340,7 @@ class TestScaleJitter: n_samples = 5 for _ in range(n_samples): - params = transform._get_params(sample) + params = transform._get_params([sample]) assert "size" in params size = params["size"] @@ -1386,7 +1387,7 @@ class TestRandomShortestSize: transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size) - params = transform._get_params(sample) + params = transform._get_params([sample]) assert "size" in params size = params["size"] @@ -1554,13 +1555,13 @@ class TestFixedSizeCrop: transform = transforms.FixedSizeCrop(size=crop_size) - sample = dict( - image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB), - bounding_boxes=make_bounding_box( + flat_inputs = [ + make_image(size=spatial_size, color_space=features.ColorSpace.RGB), + make_bounding_box( format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape ), - ) - params = transform._get_params(sample) + ] + params = transform._get_params(flat_inputs) assert params["needs_crop"] assert params["height"] <= crop_size[0] @@ -1759,7 +1760,7 @@ class TestRandomResize: transform = transforms.RandomResize(min_size=min_size, max_size=max_size) for _ in range(10): - params = transform._get_params(None) + params = transform._get_params([]) assert isinstance(params["size"], list) and len(params["size"]) == 1 size = params["size"][0] diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 589d45595..7f439fb26 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -639,7 +639,7 @@ class TestContainerTransforms: prototype_transform = prototype_transforms.RandomApply( [ prototype_transforms.Resize(256), - legacy_transforms.CenterCrop(224), + prototype_transforms.CenterCrop(224), ], p=p, ) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 5861dd291..99b77eb40 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -45,8 +45,8 @@ class RandomErasing(_RandomApplyTransform): self._log_ratio = torch.log(torch.tensor(self.ratio)) - def _get_params(self, sample: Any) -> Dict[str, Any]: - img_c, img_h, img_w = query_chw(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + img_c, img_h, img_w = query_chw(flat_inputs) if isinstance(self.value, (int, float)): value = [self.value] @@ -107,13 +107,13 @@ class _BaseMixupCutmix(_RandomApplyTransform): self.alpha = alpha self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _check_inputs(self, sample: Any) -> None: + def _check_inputs(self, flat_inputs: List[Any]) -> None: if not ( - has_any(sample, features.Image, features.Video, features.is_simple_tensor) - and has_any(sample, features.OneHotLabel) + has_any(flat_inputs, features.Image, features.Video, features.is_simple_tensor) + and has_any(flat_inputs, features.OneHotLabel) ): raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.") - if 
has_any(sample, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): + if has_any(flat_inputs, PIL.Image.Image, features.BoundingBox, features.Mask, features.Label): raise TypeError( f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels." ) @@ -127,7 +127,7 @@ class _BaseMixupCutmix(_RandomApplyTransform): class RandomMixup(_BaseMixupCutmix): - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: return dict(lam=float(self._dist.sample(()))) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -150,10 +150,10 @@ class RandomMixup(_BaseMixupCutmix): class RandomCutmix(_BaseMixupCutmix): - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: lam = float(self._dist.sample(())) - H, W = query_spatial_size(sample) + H, W = query_spatial_size(flat_inputs) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) @@ -344,9 +344,9 @@ class SimpleCopyPaste(_RandomApplyTransform): c3 += 1 def forward(self, *inputs: Any) -> Any: - flat_sample, spec = tree_flatten(inputs) + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) - images, targets = self._extract_image_targets(flat_sample) + images, targets = self._extract_image_targets(flat_inputs) # images = [t1, t2, ..., tN] # Let's define paste_images as shifted list of input images @@ -384,6 +384,6 @@ class SimpleCopyPaste(_RandomApplyTransform): output_targets.append(output_target) # Insert updated images and targets into input flat_sample - self._insert_outputs(flat_sample, output_images, output_targets) + self._insert_outputs(flat_inputs, output_images, output_targets) - return tree_unflatten(flat_sample, spec) + return tree_unflatten(flat_inputs, spec) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 02c1a18da..47fc15422 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -4,7 +4,7 @@ from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, TypeV import PIL.Image import torch -from torch.utils._pytree import tree_flatten, tree_unflatten +from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec from torchvision.prototype import features from torchvision.prototype.transforms import AutoAugmentPolicy, functional as F, InterpolationMode, Transform from torchvision.prototype.transforms.functional._meta import get_spatial_size @@ -31,16 +31,17 @@ class _AutoAugmentBase(Transform): key = keys[int(torch.randint(len(keys), ()))] return key, dct[key] - def _extract_image_or_video( + def _flatten_and_extract_image_or_video( self, - sample: Any, + inputs: Any, unsupported_types: Tuple[Type, ...] 
= (features.BoundingBox, features.Mask), - ) -> Tuple[int, Union[features.ImageType, features.VideoType]]: - sample_flat, _ = tree_flatten(sample) + ) -> Tuple[Tuple[List[Any], TreeSpec, int], Union[features.ImageType, features.VideoType]]: + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) + image_or_videos = [] - for id, inpt in enumerate(sample_flat): + for idx, inpt in enumerate(flat_inputs): if _isinstance(inpt, (features.Image, PIL.Image.Image, features.is_simple_tensor, features.Video)): - image_or_videos.append((id, inpt)) + image_or_videos.append((idx, inpt)) elif isinstance(inpt, unsupported_types): raise TypeError(f"Inputs of type {type(inpt).__name__} are not supported by {type(self).__name__}()") @@ -51,12 +52,18 @@ class _AutoAugmentBase(Transform): f"Auto augment transformations are only properly defined for a single image or video, " f"but found {len(image_or_videos)}." ) - return image_or_videos[0] - def _put_into_sample(self, sample: Any, id: int, item: Any) -> Any: - sample_flat, spec = tree_flatten(sample) - sample_flat[id] = item - return tree_unflatten(sample_flat, spec) + idx, image_or_video = image_or_videos[0] + return (flat_inputs, spec, idx), image_or_video + + def _unflatten_and_insert_image_or_video( + self, + flat_inputs_with_spec: Tuple[List[Any], TreeSpec, int], + image_or_video: Union[features.ImageType, features.VideoType], + ) -> Any: + flat_inputs, spec, idx = flat_inputs_with_spec + flat_inputs[idx] = image_or_video + return tree_unflatten(flat_inputs, spec) def _apply_image_or_video_transform( self, @@ -275,9 +282,7 @@ class AutoAugment(_AutoAugmentBase): raise ValueError(f"The provided policy {policy} is not recognized.") def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - - id, image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = get_spatial_size(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -300,7 +305,7 @@ class AutoAugment(_AutoAugmentBase): image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image_or_video) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video) class RandAugment(_AutoAugmentBase): @@ -346,9 +351,7 @@ class RandAugment(_AutoAugmentBase): self.num_magnitude_bins = num_magnitude_bins def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - - id, image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = get_spatial_size(image_or_video) for _ in range(self.num_ops): @@ -364,7 +367,7 @@ class RandAugment(_AutoAugmentBase): image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image_or_video) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video) class TrivialAugmentWide(_AutoAugmentBase): @@ -400,9 +403,7 @@ class TrivialAugmentWide(_AutoAugmentBase): self.num_magnitude_bins = num_magnitude_bins def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - - id, image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = 
get_spatial_size(image_or_video) transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -418,7 +419,7 @@ class TrivialAugmentWide(_AutoAugmentBase): image_or_video = self._apply_image_or_video_transform( image_or_video, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - return self._put_into_sample(sample, id, image_or_video) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, image_or_video) class AugMix(_AutoAugmentBase): @@ -471,8 +472,7 @@ class AugMix(_AutoAugmentBase): return torch._sample_dirichlet(params) def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] - id, orig_image_or_video = self._extract_image_or_video(sample) + flat_inputs_with_spec, orig_image_or_video = self._flatten_and_extract_image_or_video(inputs) height, width = get_spatial_size(orig_image_or_video) if isinstance(orig_image_or_video, torch.Tensor): @@ -525,4 +525,4 @@ class AugMix(_AutoAugmentBase): elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_image_pil(mix) - return self._put_into_sample(sample, id, mix) + return self._unflatten_and_insert_image_or_video(flat_inputs_with_spec, mix) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 609f03bf4..3647365c3 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -1,5 +1,5 @@ import collections.abc -from typing import Any, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import PIL.Image import torch @@ -53,7 +53,7 @@ class ColorJitter(Transform): def _generate_value(left: float, right: float) -> float: return float(torch.distributions.Uniform(left, right).sample()) - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: fn_idx = torch.randperm(4) b = None if self.brightness is None else self._generate_value(self.brightness[0], self.brightness[1]) @@ -99,8 +99,8 @@ class RandomPhotometricDistort(Transform): self.saturation = saturation self.p = p - def _get_params(self, sample: Any) -> Dict[str, Any]: - num_channels, *_ = query_chw(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + num_channels, *_ = query_chw(flat_inputs) return dict( zip( ["brightness", "contrast1", "saturation", "hue", "contrast2"], diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index e401534f4..ac61f4f77 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Dict, Union +from typing import Any, Dict, List, Union import numpy as np import PIL.Image @@ -79,8 +79,8 @@ class RandomGrayscale(_RandomApplyTransform): super().__init__(p=p) - def _get_params(self, sample: Any) -> Dict[str, Any]: - num_input_channels, *_ = query_chw(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + num_input_channels, *_ = query_chw(flat_inputs) return dict(num_input_channels=num_input_channels) def _transform( diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 5b31adc9e..4987256ce 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -104,8 +104,8 @@ class RandomResizedCrop(Transform): self._log_ratio 
= torch.log(torch.tensor(self.ratio)) - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) area = height * width log_ratio = self._log_ratio @@ -184,8 +184,8 @@ class FiveCrop(Transform): ) -> Tuple[ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT, ImageOrVideoTypeJIT]: return F.five_crop(inpt, self.size) - def _check_inputs(self, sample: Any) -> None: - if has_any(sample, features.BoundingBox, features.Mask): + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if has_any(flat_inputs, features.BoundingBox, features.Mask): raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") @@ -201,8 +201,8 @@ class TenCrop(Transform): self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.vertical_flip = vertical_flip - def _check_inputs(self, sample: Any) -> None: - if has_any(sample, features.BoundingBox, features.Mask): + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if has_any(flat_inputs, features.BoundingBox, features.Mask): raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()") def _transform( @@ -256,8 +256,8 @@ class RandomZoomOut(_RandomApplyTransform): if side_range[0] < 1.0 or side_range[0] > side_range[1]: raise ValueError(f"Invalid canvas side range provided {side_range}.") - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_h, orig_w = query_spatial_size(flat_inputs) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -299,7 +299,7 @@ class RandomRotation(Transform): self.center = center - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) return dict(angle=angle) @@ -355,8 +355,8 @@ class RandomAffine(Transform): self.center = center - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) if self.translate is not None: @@ -417,8 +417,8 @@ class RandomCrop(Transform): self.fill = _setup_fill_arg(fill) self.padding_mode = padding_mode - def _get_params(self, sample: Any) -> Dict[str, Any]: - padded_height, padded_width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + padded_height, padded_width = query_spatial_size(flat_inputs) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -505,8 +505,8 @@ class RandomPerspective(_RandomApplyTransform): self.interpolation = interpolation self.fill = _setup_fill_arg(fill) - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) distortion_scale = self.distortion_scale @@ -559,8 +559,8 @@ class ElasticTransform(Transform): self.interpolation = interpolation self.fill = 
_setup_fill_arg(fill) - def _get_params(self, sample: Any) -> Dict[str, Any]: - size = list(query_spatial_size(sample)) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + size = list(query_spatial_size(flat_inputs)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -614,20 +614,20 @@ class RandomIoUCrop(Transform): self.options = sampler_options self.trials = trials - def _check_inputs(self, sample: Any) -> None: + def _check_inputs(self, flat_inputs: List[Any]) -> None: if not ( - has_all(sample, features.BoundingBox) - and has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor) - and has_any(sample, features.Label, features.OneHotLabel) + has_all(flat_inputs, features.BoundingBox) + and has_any(flat_inputs, PIL.Image.Image, features.Image, features.is_simple_tensor) + and has_any(flat_inputs, features.Label, features.OneHotLabel) ): raise TypeError( f"{type(self).__name__}() requires input sample to contain Images or PIL Images, " "BoundingBoxes and Labels or OneHotLabels. Sample can also contain Masks." ) - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(sample) - bboxes = query_bounding_box(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_h, orig_w = query_spatial_size(flat_inputs) + bboxes = query_bounding_box(flat_inputs) while True: # sample an option @@ -712,8 +712,8 @@ class ScaleJitter(Transform): self.interpolation = interpolation self.antialias = antialias - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_height, orig_width = query_spatial_size(flat_inputs) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -740,8 +740,8 @@ class RandomShortestSize(Transform): self.interpolation = interpolation self.antialias = antialias - def _get_params(self, sample: Any) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + orig_height, orig_width = query_spatial_size(flat_inputs) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) @@ -771,20 +771,22 @@ class FixedSizeCrop(Transform): self.padding_mode = padding_mode - def _check_inputs(self, sample: Any) -> None: - if not has_any(sample, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if not has_any(flat_inputs, PIL.Image.Image, features.Image, features.is_simple_tensor, features.Video): raise TypeError( f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." ) - if has_any(sample, features.BoundingBox) and not has_any(sample, features.Label, features.OneHotLabel): + if has_any(flat_inputs, features.BoundingBox) and not has_any( + flat_inputs, features.Label, features.OneHotLabel + ): raise TypeError( f"If a BoundingBox is contained in the input sample, " f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." 
) - def _get_params(self, sample: Any) -> Dict[str, Any]: - height, width = query_spatial_size(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + height, width = query_spatial_size(flat_inputs) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -798,7 +800,7 @@ class FixedSizeCrop(Transform): left = int(offset_width * r) try: - bounding_boxes = query_bounding_box(sample) + bounding_boxes = query_bounding_box(flat_inputs) except ValueError: bounding_boxes = None @@ -874,7 +876,7 @@ class RandomResize(Transform): self.interpolation = interpolation self.antialias = antialias - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: size = int(torch.randint(self.min_size, self.max_size, ())) return dict(size=[size]) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 61be60cee..e26656339 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -1,6 +1,6 @@ import functools from collections import defaultdict -from typing import Any, Callable, Dict, Sequence, Type, Union +from typing import Any, Callable, Dict, List, Sequence, Type, Union import PIL.Image @@ -134,7 +134,7 @@ class GaussianBlur(Transform): self.sigma = _setup_float_or_seq(sigma, "sigma", 2) - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: sigma = torch.empty(1).uniform_(self.sigma[0], self.sigma[1]).item() return dict(sigma=[sigma, sigma]) @@ -167,8 +167,8 @@ class RemoveSmallBoundingBoxes(Transform): super().__init__() self.min_size = min_size - def _get_params(self, sample: Any) -> Dict[str, Any]: - bounding_box = query_bounding_box(sample) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + bounding_box = query_bounding_box(flat_inputs) # TODO: We can improve performance here by not using the `remove_small_boxes` function. It requires the box to # be in XYXY format only to calculate the width and height internally. 
Thus, if the box is in XYWH or CXCYWH diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 056c2da9f..523fa18fa 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -1,5 +1,5 @@ import enum -from typing import Any, Callable, Dict, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Tuple, Type, Union import PIL.Image import torch @@ -23,27 +23,27 @@ class Transform(nn.Module): super().__init__() _log_api_usage_once(self) - def _check_inputs(self, sample: Any) -> None: + def _check_inputs(self, flat_inputs: List[Any]) -> None: pass - def _get_params(self, sample: Any) -> Dict[str, Any]: + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: return dict() def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: raise NotImplementedError def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) - self._check_inputs(sample) + self._check_inputs(flat_inputs) - params = self._get_params(sample) + params = self._get_params(flat_inputs) - flat_inputs, spec = tree_flatten(sample) flat_outputs = [ self._transform(inpt, params) if _isinstance(inpt, self._transformed_types) else inpt for inpt in flat_inputs ] + return tree_unflatten(flat_outputs, spec) def extra_repr(self) -> str: @@ -73,18 +73,19 @@ class _RandomApplyTransform(Transform): # early afterwards in case the random check triggers. The same result could be achieved by calling # `super().forward()` after the random check, but that would call `self._check_inputs` twice. - sample = inputs if len(inputs) > 1 else inputs[0] + inputs = inputs if len(inputs) > 1 else inputs[0] + flat_inputs, spec = tree_flatten(inputs) - self._check_inputs(sample) + self._check_inputs(flat_inputs) if torch.rand(1) >= self.p: - return sample + return inputs - params = self._get_params(sample) + params = self._get_params(flat_inputs) - flat_inputs, spec = tree_flatten(sample) flat_outputs = [ self._transform(inpt, params) if _isinstance(inpt, self._transformed_types) else inpt for inpt in flat_inputs ] + return tree_unflatten(flat_outputs, spec) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index 53b27f2e2..b3e241d16 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -1,11 +1,10 @@ import functools import numbers from collections import defaultdict -from typing import Any, Callable, Dict, Sequence, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Sequence, Tuple, Type, Union import PIL.Image -from torch.utils._pytree import tree_flatten from torchvision._utils import sequence_to_str from torchvision.prototype import features from torchvision.prototype.features._feature import FillType @@ -73,9 +72,8 @@ def _check_padding_mode_arg(padding_mode: Literal["constant", "edge", "reflect", raise ValueError("Padding mode should be either constant, edge, reflect or symmetric") -def query_bounding_box(sample: Any) -> features.BoundingBox: - flat_sample, _ = tree_flatten(sample) - bounding_boxes = {item for item in flat_sample if isinstance(item, features.BoundingBox)} +def query_bounding_box(flat_inputs: List[Any]) -> features.BoundingBox: + bounding_boxes = {inpt for inpt in flat_inputs if isinstance(inpt, features.BoundingBox)} if not bounding_boxes: raise TypeError("No bounding box 
was found in the sample") elif len(bounding_boxes) > 1: @@ -83,12 +81,11 @@ def query_bounding_box(sample: Any) -> features.BoundingBox: return bounding_boxes.pop() -def query_chw(sample: Any) -> Tuple[int, int, int]: - flat_sample, _ = tree_flatten(sample) +def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: chws = { - tuple(get_dimensions(item)) - for item in flat_sample - if isinstance(item, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(item) + tuple(get_dimensions(inpt)) + for inpt in flat_inputs + if isinstance(inpt, (features.Image, PIL.Image.Image, features.Video)) or features.is_simple_tensor(inpt) } if not chws: raise TypeError("No image or video was found in the sample") @@ -98,13 +95,12 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: return c, h, w -def query_spatial_size(sample: Any) -> Tuple[int, int]: - flat_sample, _ = tree_flatten(sample) +def query_spatial_size(flat_inputs: List[Any]) -> Tuple[int, int]: sizes = { - tuple(get_spatial_size(item)) - for item in flat_sample - if isinstance(item, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) - or features.is_simple_tensor(item) + tuple(get_spatial_size(inpt)) + for inpt in flat_inputs + if isinstance(inpt, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) + or features.is_simple_tensor(inpt) } if not sizes: raise TypeError("No image, video, mask or bounding box was found in the sample") @@ -121,19 +117,17 @@ def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], boo return False -def has_any(sample: Any, *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: - flat_sample, _ = tree_flatten(sample) - for obj in flat_sample: - if _isinstance(obj, types_or_checks): +def has_any(flat_inputs: List[Any], *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: + for inpt in flat_inputs: + if _isinstance(inpt, types_or_checks): return True return False -def has_all(sample: Any, *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: - flat_sample, _ = tree_flatten(sample) +def has_all(flat_inputs: List[Any], *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool: for type_or_check in types_or_checks: - for obj in flat_sample: - if isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj): + for inpt in flat_inputs: + if isinstance(inpt, type_or_check) if isinstance(type_or_check, type) else type_or_check(inpt): break else: return False -- GitLab From 88b6b93d2b7f89b30f427ec715bff8dd1756535e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 12:54:56 +0100 Subject: [PATCH 046/624] Extend `RandomShortestSize` to support Video specific flavour of the augmentation (#6770) * Extend RandomShortestSize to support Video specific flavour of the augmentation * Adding a test. 
* Apply changes from code review --- test/test_prototype_transforms.py | 10 ++++------ torchvision/prototype/transforms/_geometry.py | 6 ++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 11a51f7b5..5928e6718 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1379,10 +1379,9 @@ class TestScaleJitter: class TestRandomShortestSize: - def test__get_params(self, mocker): + @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) + def test__get_params(self, min_size, max_size, mocker): spatial_size = (3, 10) - min_size = [5, 9] - max_size = 20 transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) @@ -1395,10 +1394,9 @@ class TestRandomShortestSize: assert isinstance(size, tuple) and len(size) == 2 longer = max(size) - assert longer <= max_size - shorter = min(size) - if longer == max_size: + if max_size is not None: + assert longer <= max_size assert shorter <= max_size else: assert shorter in min_size diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 4987256ce..5c67bf0ec 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -730,7 +730,7 @@ class RandomShortestSize(Transform): def __init__( self, min_size: Union[List[int], Tuple[int], int], - max_size: int, + max_size: Optional[int] = None, interpolation: InterpolationMode = InterpolationMode.BILINEAR, antialias: Optional[bool] = None, ): @@ -744,7 +744,9 @@ class RandomShortestSize(Transform): orig_height, orig_width = query_spatial_size(flat_inputs) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] - r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) + r = min_size / min(orig_height, orig_width) + if self.max_size is not None: + r = min(r, self.max_size / max(orig_height, orig_width)) new_width = int(orig_width * r) new_height = int(orig_height * r) -- GitLab From c960273c131e41a06a7b47836fb5ee81c88ebc5d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 13:34:29 +0100 Subject: [PATCH 047/624] Switch `view()` with `reshape()` on equalize (#6772) --- torchvision/prototype/transforms/functional/_color.py | 2 +- torchvision/transforms/functional_tensor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 7cbf8885c..8460f9c64 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -227,7 +227,7 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image - return _equalize_image_tensor_vec(image.view(-1, height, width)).view(image.shape) + return _equalize_image_tensor_vec(image.view(-1, height, width)).reshape(image.shape) equalize_image_pil = _FP.equalize diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index 20b76fbf0..4944c75fa 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -875,7 +875,7 @@ def _scale_channel(img_chan: Tensor) -> Tensor: if img_chan.is_cuda: hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255) else: - hist = torch.bincount(img_chan.view(-1), minlength=256) + hist 
= torch.bincount(img_chan.reshape(-1), minlength=256) nonzero_hist = hist[hist != 0] step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode="floor") -- GitLab From 8ec7a70f29010945cf640645d2cd16cb79bf3d9e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 14 Oct 2022 16:05:12 +0200 Subject: [PATCH 048/624] allow tolerances in transforms consistency checks (#6774) --- test/test_prototype_transforms_consistency.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 7f439fb26..7d2f1d735 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -12,6 +12,7 @@ import pytest import torch from prototype_common_utils import ( ArgsKwargs, + assert_close, assert_equal, make_bounding_box, make_detection_mask, @@ -40,6 +41,7 @@ class ConsistencyConfig: make_images_kwargs=None, supports_pil=True, removed_params=(), + closeness_kwargs=None, ): self.prototype_cls = prototype_cls self.legacy_cls = legacy_cls @@ -47,6 +49,7 @@ class ConsistencyConfig: self.make_images_kwargs = make_images_kwargs or DEFAULT_MAKE_IMAGES_KWARGS self.supports_pil = supports_pil self.removed_params = removed_params + self.closeness_kwargs = closeness_kwargs or dict(rtol=0, atol=0) # These are here since both the prototype and legacy transform need to be constructed with the same random parameters @@ -491,10 +494,14 @@ def test_signature_consistency(config): assert prototype_kinds == legacy_kinds -def check_call_consistency(prototype_transform, legacy_transform, images=None, supports_pil=True): +def check_call_consistency( + prototype_transform, legacy_transform, images=None, supports_pil=True, closeness_kwargs=None +): if images is None: images = make_images(**DEFAULT_MAKE_IMAGES_KWARGS) + closeness_kwargs = closeness_kwargs or dict() + for image in images: image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" @@ -520,10 +527,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s f"`is_simple_tensor` path in `_transform`." ) from exc - assert_equal( + assert_close( output_prototype_tensor, output_legacy_tensor, msg=lambda msg: f"Tensor image consistency check failed with: \n\n{msg}", + **closeness_kwargs, ) try: @@ -536,10 +544,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s f"`features.Image` path in `_transform`." ) from exc - assert_equal( + assert_close( output_prototype_image, output_prototype_tensor, msg=lambda msg: f"Output for feature and tensor images is not equal: \n\n{msg}", + **closeness_kwargs, ) if image.ndim == 3 and supports_pil: @@ -565,10 +574,11 @@ def check_call_consistency(prototype_transform, legacy_transform, images=None, s f"`PIL.Image.Image` path in `_transform`." 
) from exc - assert_equal( + assert_close( output_prototype_pil, output_legacy_pil, msg=lambda msg: f"PIL image consistency check failed with: \n\n{msg}", + **closeness_kwargs, ) @@ -606,6 +616,7 @@ def test_call_consistency(config, args_kwargs): legacy_transform, images=make_images(**config.make_images_kwargs), supports_pil=config.supports_pil, + closeness_kwargs=config.closeness_kwargs, ) -- GitLab From e1aacdd9f0712ec971e689dc23e7a3204597179d Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 15:52:08 +0100 Subject: [PATCH 049/624] Update `ToDtype` to avoid unnecessary `to()` calls and fixing types on `Transform` (#6773) * Fix `ToDtype` to avoid errors when a type is not defined. * Nit `(features.is_simple_tensor, features._Feature)` to `(Tensor,)` * Fixing linter * Adding comment. * Switch back to indexing. Python's default dict seems to have a nasty behaviour. --- torchvision/prototype/transforms/_misc.py | 5 ++++- torchvision/prototype/transforms/_transform.py | 8 ++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index e26656339..b31c688dc 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -157,7 +157,10 @@ class ToDtype(Transform): self.dtype = dtype def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - return inpt.to(self.dtype[type(inpt)]) + dtype = self.dtype[type(inpt)] + if dtype is None: + return inpt + return inpt.to(dtype=dtype) class RemoveSmallBoundingBoxes(Transform): diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 523fa18fa..95cf9c011 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -5,7 +5,6 @@ import PIL.Image import torch from torch import nn from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision.prototype import features from torchvision.prototype.transforms._utils import _isinstance from torchvision.utils import _log_api_usage_once @@ -13,11 +12,8 @@ from torchvision.utils import _log_api_usage_once class Transform(nn.Module): # Class attribute defining transformed types. Other types are passed-through without any transformation - _transformed_types: Tuple[Union[Type, Callable[[Any], bool]], ...] = ( - features.is_simple_tensor, - features._Feature, - PIL.Image.Image, - ) + # We support both Types and callables that are able to do further checks on the type of the input. + _transformed_types: Tuple[Union[Type, Callable[[Any], bool]], ...] 
= (torch.Tensor, PIL.Image.Image) def __init__(self) -> None: super().__init__() -- GitLab From e2fa1f9ddfe63237c226ef478baf1e35ff8d7e7e Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 14 Oct 2022 17:37:58 +0100 Subject: [PATCH 050/624] Reshare input before equalize (#6775) --- torchvision/prototype/transforms/functional/_color.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 8460f9c64..379736b00 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -227,7 +227,7 @@ def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image - return _equalize_image_tensor_vec(image.view(-1, height, width)).reshape(image.shape) + return _equalize_image_tensor_vec(image.reshape(-1, height, width)).reshape(image.shape) equalize_image_pil = _FP.equalize -- GitLab From f467349ce0d41c23695538add22f6fec5a30ece4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Sat, 15 Oct 2022 08:16:21 +0200 Subject: [PATCH 051/624] replace .view with .reshape (#6777) --- .../prototype/transforms/_auto_augment.py | 10 +-- torchvision/prototype/transforms/_misc.py | 4 +- .../prototype/transforms/functional/_color.py | 6 +- .../transforms/functional/_geometry.py | 66 +++++++++---------- .../prototype/transforms/functional/_misc.py | 4 +- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py index 47fc15422..56d581eff 100644 --- a/torchvision/prototype/transforms/_auto_augment.py +++ b/torchvision/prototype/transforms/_auto_augment.py @@ -484,7 +484,7 @@ class AugMix(_AutoAugmentBase): orig_dims = list(image_or_video.shape) expected_ndim = 5 if isinstance(orig_image_or_video, features.Video) else 4 - batch = image_or_video.view([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) + batch = image_or_video.reshape([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) # Sample the beta weights for combining the original and augmented image or video. To get Beta, we use a @@ -497,9 +497,9 @@ class AugMix(_AutoAugmentBase): # Sample the mixing weights and combine them with the ones sampled from Beta for the augmented images or videos. 
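        # (Illustrative note: `m[:, 0]` weights the original batch and `m[:, 1]` weights the augmented
        # branch, so each sample ends up as m0 * original + m1 * sum_i(w_i * chain_i) with Dirichlet
        # weights w_i -- a convex combination, since m0 + m1 == 1 and the w_i sum to 1.)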
combined_weights = self._sample_dirichlet( torch.tensor([self.alpha] * self.mixture_width, device=batch.device).expand(batch_dims[0], -1) - ) * m[:, 1].view([batch_dims[0], -1]) + ) * m[:, 1].reshape([batch_dims[0], -1]) - mix = m[:, 0].view(batch_dims) * batch + mix = m[:, 0].reshape(batch_dims) * batch for i in range(self.mixture_width): aug = batch depth = self.chain_depth if self.chain_depth > 0 else int(torch.randint(low=1, high=4, size=(1,)).item()) @@ -517,8 +517,8 @@ class AugMix(_AutoAugmentBase): aug = self._apply_image_or_video_transform( aug, transform_id, magnitude, interpolation=self.interpolation, fill=self.fill ) - mix.add_(combined_weights[:, i].view(batch_dims) * aug) - mix = mix.view(orig_dims).to(dtype=image_or_video.dtype) + mix.add_(combined_weights[:, i].reshape(batch_dims) * aug) + mix = mix.reshape(orig_dims).to(dtype=image_or_video.dtype) if isinstance(orig_image_or_video, (features.Image, features.Video)): mix = orig_image_or_video.wrap_like(orig_image_or_video, mix) # type: ignore[arg-type] diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index b31c688dc..bf7af5c26 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -88,9 +88,9 @@ class LinearTransformation(Transform): f"Got {inpt.device} vs {self.mean_vector.device}" ) - flat_tensor = inpt.view(-1, n) - self.mean_vector + flat_tensor = inpt.reshape(-1, n) - self.mean_vector transformed_tensor = torch.mm(flat_tensor, self.transformation_matrix) - return transformed_tensor.view(shape) + return transformed_tensor.reshape(shape) class Normalize(Transform): diff --git a/torchvision/prototype/transforms/functional/_color.py b/torchvision/prototype/transforms/functional/_color.py index 379736b00..49a769e04 100644 --- a/torchvision/prototype/transforms/functional/_color.py +++ b/torchvision/prototype/transforms/functional/_color.py @@ -69,7 +69,7 @@ def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float) shape = image.shape if image.ndim > 4: - image = image.view(-1, num_channels, height, width) + image = image.reshape(-1, num_channels, height, width) needs_unsquash = True else: needs_unsquash = False @@ -77,7 +77,7 @@ def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float) output = _FT._blend(image, _FT._blurred_degenerate_image(image), sharpness_factor) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output @@ -213,7 +213,7 @@ def _equalize_image_tensor_vec(img: torch.Tensor) -> torch.Tensor: zeros = lut.new_zeros((1, 1)).expand(shape[0], 1) lut = torch.cat([zeros, lut[:, :-1]], dim=1) - return torch.where((step == 0).unsqueeze(-1), img, lut.gather(dim=1, index=flat_img).view_as(img)) + return torch.where((step == 0).unsqueeze(-1), img, lut.gather(dim=1, index=flat_img).reshape_as(img)) def equalize_image_tensor(image: torch.Tensor) -> torch.Tensor: diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 43962ad4d..1c897700c 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -38,13 +38,13 @@ def horizontal_flip_bounding_box( bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]] return 
convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(shape) + ).reshape(shape) def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor: @@ -75,13 +75,13 @@ def vertical_flip_bounding_box( bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(shape) + ).reshape(shape) def vertical_flip_video(video: torch.Tensor) -> torch.Tensor: @@ -123,7 +123,7 @@ def resize_image_tensor( extra_dims = image.shape[:-3] if image.numel() > 0: - image = image.view(-1, num_channels, old_height, old_width) + image = image.reshape(-1, num_channels, old_height, old_width) image = _FT.resize( image, @@ -132,7 +132,7 @@ def resize_image_tensor( antialias=antialias, ) - return image.view(extra_dims + (num_channels, new_height, new_width)) + return image.reshape(extra_dims + (num_channels, new_height, new_width)) @torch.jit.unused @@ -168,7 +168,7 @@ def resize_bounding_box( new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) ratios = torch.tensor((new_width / old_width, new_height / old_height), device=bounding_box.device) return ( - bounding_box.view(-1, 2, 2).mul(ratios).to(bounding_box.dtype).view(bounding_box.shape), + bounding_box.reshape(-1, 2, 2).mul(ratios).to(bounding_box.dtype).reshape(bounding_box.shape), (new_height, new_width), ) @@ -270,7 +270,7 @@ def affine_image_tensor( num_channels, height, width = image.shape[-3:] extra_dims = image.shape[:-3] - image = image.view(-1, num_channels, height, width) + image = image.reshape(-1, num_channels, height, width) angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center) @@ -283,7 +283,7 @@ def affine_image_tensor( matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear) output = _FT.affine(image, matrix, interpolation=interpolation.value, fill=fill) - return output.view(extra_dims + (num_channels, height, width)) + return output.reshape(extra_dims + (num_channels, height, width)) @torch.jit.unused @@ -338,20 +338,20 @@ def _affine_bounding_box_xyxy( dtype=dtype, device=device, ) - .view(2, 3) + .reshape(2, 3) .T ) # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-left, bottom-right corners). 
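    # (Illustration: a single XYXY box [x1, y1, x2, y2] indexed with [[0, 1], [2, 1], [2, 3], [0, 3]]
    # yields the corners (x1, y1), (x2, y1), (x2, y2), (x1, y2), so N boxes flatten to (N * 4, 2)
    # before the homogeneous 1s are appended.)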
# Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] - points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using affine matrix transformed_points = torch.matmul(points, transposed_affine_matrix) # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] # and compute bounding box from 4 transformed points: - transformed_points = transformed_points.view(-1, 4, 2) + transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, _ = torch.min(transformed_points, dim=1) out_bbox_maxs, _ = torch.max(transformed_points, dim=1) out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1) @@ -396,7 +396,7 @@ def affine_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, spatial_size, angle, translate, scale, shear, center) @@ -404,7 +404,7 @@ def affine_bounding_box( return convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape) + ).reshape(original_shape) def affine_mask( @@ -539,7 +539,7 @@ def rotate_image_tensor( if image.numel() > 0: image = _FT.rotate( - image.view(-1, num_channels, height, width), + image.reshape(-1, num_channels, height, width), matrix, interpolation=interpolation.value, expand=expand, @@ -549,7 +549,7 @@ def rotate_image_tensor( else: new_width, new_height = _FT._compute_affine_output_size(matrix, width, height) if expand else (width, height) - return image.view(extra_dims + (num_channels, new_height, new_width)) + return image.reshape(extra_dims + (num_channels, new_height, new_width)) @torch.jit.unused @@ -585,7 +585,7 @@ def rotate_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) out_bboxes, spatial_size = _affine_bounding_box_xyxy( bounding_box, @@ -601,7 +601,7 @@ def rotate_bounding_box( return ( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape), + ).reshape(original_shape), spatial_size, ) @@ -691,7 +691,7 @@ def _pad_with_scalar_fill( if image.numel() > 0: image = _FT.pad( - img=image.view(-1, num_channels, height, width), padding=padding, fill=fill, padding_mode=padding_mode + img=image.reshape(-1, num_channels, height, width), padding=padding, fill=fill, padding_mode=padding_mode ) new_height, new_width = image.shape[-2:] else: @@ -699,7 +699,7 @@ def _pad_with_scalar_fill( new_height = height + top + bottom new_width = width + left + right - return image.view(extra_dims + (num_channels, new_height, new_width)) + return image.reshape(extra_dims + (num_channels, new_height, new_width)) # TODO: This should be removed once pytorch pad supports non-scalar padding values @@ -714,7 +714,7 @@ def _pad_with_vector_fill( output = _pad_with_scalar_fill(image, padding, fill=0, padding_mode="constant") left, right, top, bottom = _parse_pad_padding(padding) - fill = 
torch.tensor(fill, dtype=image.dtype, device=image.device).view(-1, 1, 1) + fill = torch.tensor(fill, dtype=image.dtype, device=image.device).reshape(-1, 1, 1) if top > 0: output[..., :top, :] = fill @@ -863,7 +863,7 @@ def perspective_image_tensor( shape = image.shape if image.ndim > 4: - image = image.view((-1,) + shape[-3:]) + image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True else: needs_unsquash = False @@ -871,7 +871,7 @@ def perspective_image_tensor( output = _FT.perspective(image, perspective_coeffs, interpolation=interpolation.value, fill=fill) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output @@ -898,7 +898,7 @@ def perspective_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 device = bounding_box.device @@ -947,7 +947,7 @@ def perspective_bounding_box( # Tensor of points has shape (N * 4, 3), where N is the number of bboxes # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] - points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) # 2) Now let's transform the points using perspective matrices # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1) @@ -959,7 +959,7 @@ def perspective_bounding_box( # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] # and compute bounding box from 4 transformed points: - transformed_points = transformed_points.view(-1, 4, 2) + transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, _ = torch.min(transformed_points, dim=1) out_bbox_maxs, _ = torch.max(transformed_points, dim=1) out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype) @@ -968,7 +968,7 @@ def perspective_bounding_box( return convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape) + ).reshape(original_shape) def perspective_mask( @@ -1027,7 +1027,7 @@ def elastic_image_tensor( shape = image.shape if image.ndim > 4: - image = image.view((-1,) + shape[-3:]) + image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True else: needs_unsquash = False @@ -1035,7 +1035,7 @@ def elastic_image_tensor( output = _FT.elastic_transform(image, displacement, interpolation=interpolation.value, fill=fill) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output @@ -1063,7 +1063,7 @@ def elastic_bounding_box( original_shape = bounding_box.shape bounding_box = convert_format_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY - ).view(-1, 4) + ).reshape(-1, 4) # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it # Or add spatial_size arg and check displacement shape @@ -1075,21 +1075,21 @@ def elastic_bounding_box( inv_grid = id_grid - displacement # Get points from bboxes - points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) index_x = torch.floor(points[:, 0] + 
0.5).to(dtype=torch.long) index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long) # Transform points: t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5 - transformed_points = transformed_points.view(-1, 4, 2) + transformed_points = transformed_points.reshape(-1, 4, 2) out_bbox_mins, _ = torch.min(transformed_points, dim=1) out_bbox_maxs, _ = torch.max(transformed_points, dim=1) out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype) return convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ).view(original_shape) + ).reshape(original_shape) def elastic_mask( diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 8fda24e17..5b2dd135a 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -65,7 +65,7 @@ def gaussian_blur_image_tensor( shape = image.shape if image.ndim > 4: - image = image.view((-1,) + shape[-3:]) + image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True else: needs_unsquash = False @@ -73,7 +73,7 @@ def gaussian_blur_image_tensor( output = _FT.gaussian_blur(image, kernel_size, sigma) if needs_unsquash: - output = output.view(shape) + output = output.reshape(shape) return output -- GitLab From 149edda463b54b3eabe989e260a839727c89d099 Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 17 Oct 2022 09:59:33 +0200 Subject: [PATCH 052/624] [proto] Reduce number of calls of __torch_function__ (#6681) * [proto] Reduce number of calls of __torch_function__ * Use DisableTorchFunction and super * Use self._tensor * Fixes mypy and color space handling * revert Image.new_like * WIP * Perf opt with ref to tensor and properties * Removed requires_grad property * Use _tensor ref * Revert "Use _tensor ref" This reverts commit 38f8e21242830fed46ddf31287edb67c1abd124a. 
* Update torchvision/prototype/features/_feature.py Co-authored-by: Philip Meier Co-authored-by: Philip Meier --- torchvision/prototype/features/_feature.py | 23 ++++++++++++++++++++++ torchvision/prototype/features/_video.py | 6 +++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index a56441f29..1cc2d8d4b 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -6,6 +6,7 @@ from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Type import PIL.Image import torch from torch._C import DisableTorchFunction +from torch.types import _device, _dtype, _size from torchvision.transforms import InterpolationMode @@ -128,6 +129,28 @@ class _Feature(torch.Tensor): _Feature.__F = functional return _Feature.__F + # Add properties for common attributes like shape, dtype, device, ndim etc + # this way we return the result without passing into __torch_function__ + @property + def shape(self) -> _size: # type: ignore[override] + with DisableTorchFunction(): + return super().shape + + @property + def ndim(self) -> int: # type: ignore[override] + with DisableTorchFunction(): + return super().ndim + + @property + def device(self, *args: Any, **kwargs: Any) -> _device: # type: ignore[override] + with DisableTorchFunction(): + return super().device + + @property + def dtype(self) -> _dtype: # type: ignore[override] + with DisableTorchFunction(): + return super().dtype + def horizontal_flip(self) -> _Feature: return self diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index 9dfff7f96..26f97549a 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -15,9 +15,9 @@ class Video(_Feature): @classmethod def _wrap(cls, tensor: torch.Tensor, *, color_space: ColorSpace) -> Video: - image = tensor.as_subclass(cls) - image.color_space = color_space - return image + video = tensor.as_subclass(cls) + video.color_space = color_space + return video def __new__( cls, -- GitLab From decb191962250e5969686f0fc07c8ee5f45b181b Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 17 Oct 2022 16:45:28 +0200 Subject: [PATCH 053/624] [proto] Small optimization for gaussian_blur functional op (#6762) * Use softmax in _get_gaussian_kernel1d * Revert "Use softmax in _get_gaussian_kernel1d" This reverts commit eb8fba36302d2da9e06e6f40afaaf901b276a771. 
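
Despite the add/revert back and forth in the bullets above, the version of _get_gaussian_kernel1d that this patch lands (in the _misc.py hunk further down) does build the 1-D kernel as softmax(-x**2). That is algebraically the same kernel as the classic exp-then-normalize formulation, since softmax divides exp(-x**2) by its sum; the max subtraction inside softmax is what adds numerical stability, and the relaxed tolerances in the consistency tests come from float32 accumulation differences in the new conv2d path rather than from the kernel values themselves. A small numeric check, assuming the classic formulation as the baseline:

    import math
    import torch

    kernel_size, sigma = 5, 1.5

    # formulation used in the patch
    lim = (kernel_size - 1) / (2 * math.sqrt(2) * sigma)
    x = torch.linspace(-lim, lim, steps=kernel_size)
    new = torch.softmax(-x.pow(2), dim=0)

    # classic formulation: normalized exp(-0.5 * (x / sigma) ** 2)
    x_ref = torch.linspace(-(kernel_size - 1) * 0.5, (kernel_size - 1) * 0.5, steps=kernel_size)
    pdf = torch.exp(-0.5 * (x_ref / sigma).pow(2))
    ref = pdf / pdf.sum()

    print(torch.allclose(new, ref))   # True
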
* Code update * Relaxed tolerance in consistency tests for GaussianBlur and ElasticTransform * Code review updates * Update test_prototype_transforms_consistency.py --- test/test_prototype_transforms_consistency.py | 40 +++++++++++-------- .../prototype/transforms/functional/_misc.py | 31 +++++++++++++- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 7d2f1d735..212755068 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -308,22 +308,28 @@ CONSISTENCY_CONFIGS = [ ArgsKwargs(brightness=0.1, contrast=0.4, saturation=0.7, hue=0.3), ], ), - ConsistencyConfig( - prototype_transforms.ElasticTransform, - legacy_transforms.ElasticTransform, - [ - ArgsKwargs(), - ArgsKwargs(alpha=20.0), - ArgsKwargs(alpha=(15.3, 27.2)), - ArgsKwargs(sigma=3.0), - ArgsKwargs(sigma=(2.5, 3.9)), - ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.BICUBIC), - ArgsKwargs(fill=1), - ], - # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(163, 163), (72, 333), (313, 95)]), - ), + *[ + ConsistencyConfig( + prototype_transforms.ElasticTransform, + legacy_transforms.ElasticTransform, + [ + ArgsKwargs(), + ArgsKwargs(alpha=20.0), + ArgsKwargs(alpha=(15.3, 27.2)), + ArgsKwargs(sigma=3.0), + ArgsKwargs(sigma=(2.5, 3.9)), + ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.NEAREST), + ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.BICUBIC), + ArgsKwargs(fill=1), + ], + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(163, 163), (72, 333), (313, 95)], dtypes=[dt]), + # We updated gaussian blur kernel generation with a faster and numerically more stable version + # This brings float32 accumulation visible in elastic transform -> we need to relax consistency tolerance + closeness_kwargs=ckw, + ) + for dt, ckw in [(torch.uint8, {"rtol": 1e-1, "atol": 1}), (torch.float32, {"rtol": 1e-2, "atol": 1e-3})] + ], ConsistencyConfig( prototype_transforms.GaussianBlur, legacy_transforms.GaussianBlur, @@ -333,6 +339,7 @@ CONSISTENCY_CONFIGS = [ ArgsKwargs(kernel_size=3, sigma=0.7), ArgsKwargs(kernel_size=5, sigma=(0.3, 1.4)), ], + closeness_kwargs={"rtol": 1e-5, "atol": 1e-5}, ), ConsistencyConfig( prototype_transforms.RandomAffine, @@ -506,7 +513,6 @@ def check_call_consistency( image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" image_tensor = torch.Tensor(image) - try: torch.manual_seed(0) output_legacy_tensor = legacy_transform(image_tensor) diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 5b2dd135a..fa4a6e9be 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -1,7 +1,9 @@ +import math from typing import List, Optional, Union import PIL.Image import torch +from torch.nn.functional import conv2d, pad as torch_pad from torchvision.prototype import features from torchvision.transforms import functional_tensor as _FT from torchvision.transforms.functional import pil_to_tensor, to_pil_image @@ -32,6 +34,22 @@ def normalize( return 
normalize_image_tensor(inpt, mean=mean, std=std, inplace=inplace) +def _get_gaussian_kernel1d(kernel_size: int, sigma: float) -> torch.Tensor: + lim = (kernel_size - 1) / (2 * math.sqrt(2) * sigma) + x = torch.linspace(-lim, lim, steps=kernel_size) + kernel1d = torch.softmax(-x.pow_(2), dim=0) + return kernel1d + + +def _get_gaussian_kernel2d( + kernel_size: List[int], sigma: List[float], dtype: torch.dtype, device: torch.device +) -> torch.Tensor: + kernel1d_x = _get_gaussian_kernel1d(kernel_size[0], sigma[0]).to(device, dtype=dtype) + kernel1d_y = _get_gaussian_kernel1d(kernel_size[1], sigma[1]).to(device, dtype=dtype) + kernel2d = kernel1d_y.unsqueeze(-1) * kernel1d_x + return kernel2d + + def gaussian_blur_image_tensor( image: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> torch.Tensor: @@ -70,7 +88,18 @@ def gaussian_blur_image_tensor( else: needs_unsquash = False - output = _FT.gaussian_blur(image, kernel_size, sigma) + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 + kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype, device=image.device) + kernel = kernel.expand(image.shape[-3], 1, kernel.shape[0], kernel.shape[1]) + + image, need_cast, need_squeeze, out_dtype = _FT._cast_squeeze_in(image, [kernel.dtype]) + + # padding = (left, right, top, bottom) + padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2] + output = torch_pad(image, padding, mode="reflect") + output = conv2d(output, kernel, groups=output.shape[-3]) + + output = _FT._cast_squeeze_out(output, need_cast, need_squeeze, out_dtype) if needs_unsquash: output = output.reshape(shape) -- GitLab From 0610b13ac4af3717f538454a9c6b1f441cb386f3 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Mon, 17 Oct 2022 14:01:21 -0400 Subject: [PATCH 054/624] [Nova] Add GHA Linux CPU Unittests for Torchvision (#6759) * [Nova][WIP] Add Linux CPU Unittests for Torchvision * use conda-builder image since conda installation is needed * install torch dep with conda instead * use circleCI command to run tests * larger instance to avoid OOM issues * proper syntax for self-hosted runners * 4xlarge instance * 8xlarge * 12xlarge * use setup-miniconda job * add back PATH change to help setup py detect conda * run conda shell script * install other deps up front * git config and undo path change * revert to local conda install * conda-builder image * support for whole python version matrix * clean up the conda env once we are done with the job --- .github/workflows/test-linux-cpu.yml | 69 ++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 .github/workflows/test-linux-cpu.yml diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml new file mode 100644 index 000000000..81ace6351 --- /dev/null +++ b/.github/workflows/test-linux-cpu.yml @@ -0,0 +1,69 @@ +name: Unit-tests on Linux CPU + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +env: + CHANNEL: "nightly" + +jobs: + tests: + name: "Unit-tests on Linux CPU" + runs-on: [self-hosted, linux.12xlarge] + container: + image: pytorch/conda-builder:cpu + strategy: + matrix: + py_vers: ["3.7", "3.8", "3.9", "3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Set Release CHANNEL (for release) + if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} + run: | + echo 
"CHANNEL=test" >> "$GITHUB_ENV" + - name: Setup Conda + shell: bash -l {0} + env: + ENV_NAME: conda-env-${{ github.run_id }} + PY_VERS: ${{ matrix.py_vers }} + run: | + git config --global --add safe.directory /__w/vision/vision + . ~/miniconda3/etc/profile.d/conda.sh + conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy + echo "CONDA_RUN=conda run -p ${ENV_NAME}" >> "$GITHUB_ENV" + - name: Install TorchVision + shell: bash -l {0} + env: + VERSION: cpu + CUDATOOLKIT: cpuonly + run: | + # Needed for JPEG library detection as setup.py detects conda presence + # by running `shutil.which('conda')` + export PATH=~/miniconda3/bin:$PATH + set -ex + ${CONDA_RUN} conda install \ + --yes \ + -c "pytorch-${CHANNEL}" \ + -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ + "${CUDATOOLKIT}" + ${CONDA_RUN} python3 setup.py develop + ${CONDA_RUN} python3 -m pip install pytest pytest-mock av + - name: Run tests + shell: bash -l {0} + env: + ENV_NAME: conda-env-${{ github.run_id }} + PY_VERS: ${{ matrix.py_vers }} + run: | + . ~/miniconda3/etc/profile.d/conda.sh + set -ex + ${CONDA_RUN} python3 -m torch.utils.collect_env + ${CONDA_RUN} python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + conda env remove -p ${ENV_NAME} -- GitLab From e23542da0dd85193bb831d6f62b2101a9651cad0 Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Tue, 18 Oct 2022 15:38:36 +0100 Subject: [PATCH 055/624] Add raft_stereo weights (#6786) * Add raft_stereo weights * Update the metrics layout --- .../models/depth/stereo/raft_stereo.py | 94 ++++++++++++++++++- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/models/depth/stereo/raft_stereo.py b/torchvision/prototype/models/depth/stereo/raft_stereo.py index 541a11f04..4b6f5a0bd 100644 --- a/torchvision/prototype/models/depth/stereo/raft_stereo.py +++ b/torchvision/prototype/models/depth/stereo/raft_stereo.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Callable, List, Optional, Tuple import torch @@ -5,11 +6,12 @@ import torch.nn as nn import torch.nn.functional as F import torchvision.models.optical_flow.raft as raft from torch import Tensor -from torchvision.models._api import register_model, WeightsEnum +from torchvision.models._api import register_model, Weights, WeightsEnum from torchvision.models._utils import handle_legacy_interface from torchvision.models.optical_flow._utils import grid_sample, make_coords_grid, upsample_flow from torchvision.models.optical_flow.raft import FlowHead, MotionEncoder, ResidualBlock from torchvision.ops import Conv2dNormActivation +from torchvision.prototype.transforms._presets import StereoMatching from torchvision.utils import _log_api_usage_once @@ -624,11 +626,97 @@ def _raft_stereo( class Raft_Stereo_Realtime_Weights(WeightsEnum): - pass + SCENEFLOW_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + url="https://download.pytorch.org/models/raft_stereo_realtime-cf345ccb.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 8077152, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + "Kitty2015": { + "3px": 0.9409, + } + }, + }, + ) + + DEFAULT = SCENEFLOW_V1 class Raft_Stereo_Base_Weights(WeightsEnum): - pass + SCENEFLOW_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + 
url="https://download.pytorch.org/models/raft_stereo_base_sceneflow-eff3f2e6.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 11116176, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + # Using standard metrics for each datasets + "Kitty2015": { + # Ratio of pixels with difference less than 3px from ground truth + "3px": 0.9426, + }, + # For middlebury, ratio of pixels with difference less than 2px from ground truth + # on full, half, and quarter image resolution + "Middlebury2014-val-full": { + "2px": 0.8167, + }, + "Middlebury2014-val-half": { + "2px": 0.8741, + }, + "Middlebury2014-val-quarter": { + "2px": 0.9064, + }, + "ETH3D-val": { + # Ratio of pixels with difference less than 1px from ground truth + "1px": 0.9672, + }, + }, + }, + ) + + MIDDLEBURY_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + url="https://download.pytorch.org/models/raft_stereo_base_middlebury-afa9d252.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 11116176, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + "Middlebury-test": { + "mae": 1.27, + "1px": 0.9063, + "2px": 0.9526, + "5px": 0.9725, + } + }, + }, + ) + + ETH3D_V1 = Weights( + # Weights ported from https://github.com/princeton-vl/RAFT-Stereo + url="https://download.pytorch.org/models/raft_stereo_base_eth3d-d4830f22.pth", + transforms=partial(StereoMatching, resize_size=(224, 224)), + meta={ + "num_params": 11116176, + "recipe": "https://github.com/princeton-vl/RAFT-Stereo", + "_metrics": { + # Following metrics from paper: https://arxiv.org/abs/2109.07547 + "ETH3D-test": { + "mae": 0.18, + "1px": 0.9756, + "2px": 0.9956, + } + }, + }, + ) + + DEFAULT = MIDDLEBURY_V1 @register_model() -- GitLab From f8b5a7af8be9ef0e27187b0595d8a9304fa9ba52 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 18 Oct 2022 16:45:23 +0200 Subject: [PATCH 056/624] don't fail linux CPU tests fast (#6788) --- .github/workflows/test-linux-cpu.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 81ace6351..1e127c6ac 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -21,6 +21,7 @@ jobs: strategy: matrix: py_vers: ["3.7", "3.8", "3.9", "3.10"] + fail-fast: false steps: - name: Checkout repository -- GitLab From 32757a260dfedebf71eb470bd0a072ed20beddc3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 18 Oct 2022 16:52:09 +0200 Subject: [PATCH 057/624] fix warnings in prototype transforms test suite (#6785) * fix, ignore, or assert warnings for consistency tests * fix, ignore, or assert warnings for kernel infos * fix to_image_tensor for numpy inputs * make image from numpy contiguous * fix test --- test/prototype_transforms_kernel_infos.py | 15 +++++++++++---- test/test_prototype_transforms_consistency.py | 13 ++++++++----- test/test_prototype_transforms_functional.py | 9 +-------- .../transforms/functional/_type_conversion.py | 2 +- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c455caa6b..f8b237f2e 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1,6 +1,7 @@ import functools 
import itertools import math +import re import numpy as np import pytest @@ -172,6 +173,12 @@ KERNEL_INFOS.extend( KernelInfo( F.horizontal_flip_bounding_box, sample_inputs_fn=sample_inputs_horizontal_flip_bounding_box, + test_marks=[ + TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.filterwarnings(f"ignore:{re.escape('operator() profile_node %72')}:UserWarning"), + ) + ], ), KernelInfo( F.horizontal_flip_mask, @@ -443,10 +450,10 @@ def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, transformed_points = np.matmul(points, affine_matrix.T) out_bbox = torch.tensor( [ - np.min(transformed_points[:, 0]), - np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), + np.min(transformed_points[:, 0]).item(), + np.min(transformed_points[:, 1]).item(), + np.max(transformed_points[:, 0]).item(), + np.max(transformed_points[:, 1]).item(), ], dtype=bbox.dtype, ) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 212755068..362a7a1c0 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -1,6 +1,7 @@ import enum import inspect import random +import re from collections import defaultdict from importlib.machinery import SourceFileLoader from pathlib import Path @@ -598,6 +599,7 @@ def check_call_consistency( for idx, args_kwargs in enumerate(config.args_kwargs) ], ) +@pytest.mark.filterwarnings("ignore") def test_call_consistency(config, args_kwargs): args, kwargs = args_kwargs @@ -671,21 +673,21 @@ class TestContainerTransforms: check_call_consistency(prototype_transform, legacy_transform) # We can't test other values for `p` since the random parameter generation is different - @pytest.mark.parametrize("p", [(0, 1), (1, 0)]) - def test_random_choice(self, p): + @pytest.mark.parametrize("probabilities", [(0, 1), (1, 0)]) + def test_random_choice(self, probabilities): prototype_transform = prototype_transforms.RandomChoice( [ prototype_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - p=p, + probabilities=probabilities, ) legacy_transform = legacy_transforms.RandomChoice( [ legacy_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - p=p, + p=probabilities, ) check_call_consistency(prototype_transform, legacy_transform) @@ -702,7 +704,8 @@ class TestToTensorTransforms: assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) def test_to_tensor(self): - prototype_transform = prototype_transforms.ToTensor() + with pytest.warns(UserWarning, match=re.escape("The transform `ToTensor()` is deprecated")): + prototype_transform = prototype_transforms.ToTensor() legacy_transform = legacy_transforms.ToTensor() for image in make_images(extra_dims=[()]): diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 34291611d..bafe1f134 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1012,17 +1012,10 @@ def test_normalize_output_type(): def test_to_image_tensor(inpt): output = F.to_image_tensor(inpt) assert isinstance(output, torch.Tensor) + assert output.shape == (3, 32, 32) assert np.asarray(inpt).sum() == output.sum().item() - if isinstance(inpt, PIL.Image.Image): - # we can't check this option - # as PIL -> numpy is always copying - return - - inpt[0, 0, 0] = 11 - assert output[0, 0, 0] == 11 - @pytest.mark.parametrize( "inpt", diff --git 
a/torchvision/prototype/transforms/functional/_type_conversion.py b/torchvision/prototype/transforms/functional/_type_conversion.py index b171716ae..5fe990eb7 100644 --- a/torchvision/prototype/transforms/functional/_type_conversion.py +++ b/torchvision/prototype/transforms/functional/_type_conversion.py @@ -27,7 +27,7 @@ def decode_video_with_av(encoded_video: torch.Tensor) -> Tuple[torch.Tensor, tor @torch.jit.unused def to_image_tensor(image: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> features.Image: if isinstance(image, np.ndarray): - output = torch.from_numpy(image) + output = torch.from_numpy(image).permute((2, 0, 1)).contiguous() elif isinstance(image, PIL.Image.Image): output = pil_to_tensor(image) else: # isinstance(inpt, torch.Tensor): -- GitLab From 78fdaf3a757e6eaacc458883bcf7464b1711ce7a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 19 Oct 2022 11:21:36 +0200 Subject: [PATCH 058/624] pin pyav to <10 (#6789) * pin pyav to <10 * pin av in GHA workflows as well * also pin in M1 workflow --- .circleci/unittest/linux/scripts/environment.yml | 2 +- .circleci/unittest/windows/scripts/environment.yml | 2 +- .github/workflows/test-linux-cpu.yml | 2 +- .github/workflows/test-m1.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml index 77ee99295..fae96c5f9 100644 --- a/.circleci/unittest/linux/scripts/environment.yml +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -13,4 +13,4 @@ dependencies: - pip: - future - scipy - - av + - av < 10 diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml index 0e07ae80d..d229aafb4 100644 --- a/.circleci/unittest/windows/scripts/environment.yml +++ b/.circleci/unittest/windows/scripts/environment.yml @@ -14,6 +14,6 @@ dependencies: - pip: - future - scipy - - av != 9.1.1 + - av !=9.1.1, <10 - dataclasses - h5py diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 1e127c6ac..b6891bbfb 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -56,7 +56,7 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" ${CONDA_RUN} python3 setup.py develop - ${CONDA_RUN} python3 -m pip install pytest pytest-mock av + ${CONDA_RUN} python3 -m pip install pytest pytest-mock 'av<10' - name: Run tests shell: bash -l {0} env: diff --git a/.github/workflows/test-m1.yml b/.github/workflows/test-m1.yml index 1e5f79f82..c03fa9f76 100644 --- a/.github/workflows/test-m1.yml +++ b/.github/workflows/test-m1.yml @@ -37,7 +37,7 @@ jobs: conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy conda run -p ${ENV_NAME} python3 -mpip install --pre torch --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} conda run -p ${ENV_NAME} python3 setup.py develop - conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock av + conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock 'av<10' - name: Run tests shell: arch -arch arm64 bash {0} env: -- GitLab From 7a62a545ce76f43ccc5cfe0009131f7db14ae7b5 Mon Sep 17 00:00:00 2001 From: YosuaMichael Date: Wed, 19 Oct 2022 19:11:41 +0100 Subject: [PATCH 059/624] Some fixes for crestereo (#6791) --- .../prototype/models/depth/stereo/crestereo.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/models/depth/stereo/crestereo.py 
b/torchvision/prototype/models/depth/stereo/crestereo.py index 496438522..29c0be936 100644 --- a/torchvision/prototype/models/depth/stereo/crestereo.py +++ b/torchvision/prototype/models/depth/stereo/crestereo.py @@ -763,7 +763,7 @@ class CREStereo(nn.Module): return "1d" if iteration % 2 == 0 else "2d" def forward( - self, left_image: Tensor, right_image: Tensor, flow_init: Optional[Tensor], num_iters: int = 10 + self, left_image: Tensor, right_image: Tensor, flow_init: Optional[Tensor] = None, num_iters: int = 10 ) -> List[Tensor]: features = torch.cat([left_image, right_image], dim=0) features = self.feature_encoder(features) @@ -781,10 +781,10 @@ class CREStereo(nn.Module): ctx_pyramid = self.downsampling_pyramid(ctx) # we store in reversed order because we process the pyramid from top to bottom - l_pyramid: Dict[str, Tensor] = {res: l_pyramid[idx] for idx, res in enumerate(self.resolutions)} - r_pyramid: Dict[str, Tensor] = {res: r_pyramid[idx] for idx, res in enumerate(self.resolutions)} - net_pyramid: Dict[str, Tensor] = {res: net_pyramid[idx] for idx, res in enumerate(self.resolutions)} - ctx_pyramid: Dict[str, Tensor] = {res: ctx_pyramid[idx] for idx, res in enumerate(self.resolutions)} + l_pyramid = {res: l_pyramid[idx] for idx, res in enumerate(self.resolutions)} + r_pyramid = {res: r_pyramid[idx] for idx, res in enumerate(self.resolutions)} + net_pyramid = {res: net_pyramid[idx] for idx, res in enumerate(self.resolutions)} + ctx_pyramid = {res: ctx_pyramid[idx] for idx, res in enumerate(self.resolutions)} # offsets for sampling pixel candidates in the correlation ops offsets: Dict[str, Tensor] = {} @@ -1425,6 +1425,9 @@ def crestereo_base(*, weights: Optional[CREStereo_Base_Weights] = None, progress .. autoclass:: torchvision.prototype.models.depth.stereo.CREStereo_Base_Weights :members: """ + + weights = CREStereo_Base_Weights.verify(weights) + return _crestereo( weights=weights, progress=progress, -- GitLab From 211563fba461062268db7cdf5a83203ed9e83e6f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 20 Oct 2022 14:14:37 +0200 Subject: [PATCH 060/624] improve perf on convert_image_dtype and add tests (#6795) * improve perf on convert_image_dtype and add tests * add reference tests * use bitshifts for int to int * revert bitshifts for int to int upscale * fix warning ignore --- test/prototype_transforms_kernel_infos.py | 118 ++++++++++++++++++ test/test_prototype_transforms_functional.py | 46 +++++-- .../transforms/functional/_type_conversion.py | 77 +++++++++++- 3 files changed, 226 insertions(+), 15 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index f8b237f2e..133508f5f 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1,3 +1,4 @@ +import decimal import functools import itertools import math @@ -21,6 +22,7 @@ from prototype_common_utils import ( mark_framework_limitation, TestMark, ) +from torch.utils._pytree import tree_map from torchvision.prototype import features from torchvision.transforms.functional_tensor import _max_value as get_max_value @@ -1947,3 +1949,119 @@ KERNEL_INFOS.extend( ), ] ) + + +def sample_inputs_convert_image_dtype(): + for input_dtype, output_dtype in itertools.product( + [torch.uint8, torch.int64, torch.float32, torch.float64], repeat=2 + ): + if input_dtype.is_floating_point and output_dtype == torch.int64: + # conversion cannot be performed safely + continue + + for image_loader in make_image_loaders( + 
sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[input_dtype] + ): + yield ArgsKwargs(image_loader, dtype=output_dtype) + + yield ArgsKwargs(make_image_loader(color_space=features.ColorSpace.RGB), dtype=torch.uint8) + + +def reference_convert_image_dtype(image, dtype=torch.float): + input_dtype = image.dtype + output_dtype = dtype + + if output_dtype == input_dtype: + return image + + def fn(value): + if input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return int(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return value // factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype) + + +def reference_inputs_convert_image_dtype(): + for input_dtype, output_dtype in itertools.product( + [ + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.float16, + torch.float32, + torch.float64, + torch.bfloat16, + ], + repeat=2, + ): + if (input_dtype == torch.float32 and output_dtype in {torch.int32, torch.int64}) or ( + input_dtype == torch.float64 and output_dtype == torch.int64 + ): + continue + + if input_dtype.is_floating_point: + data = [0.0, 0.5, 1.0] + else: + max_value = torch.iinfo(input_dtype).max + data = [0, max_value // 2, max_value] + image = torch.tensor(data, dtype=input_dtype) + + yield ArgsKwargs(image, dtype=output_dtype) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.convert_image_dtype, + sample_inputs_fn=sample_inputs_convert_image_dtype, + reference_fn=reference_convert_image_dtype, + reference_inputs_fn=reference_inputs_convert_image_dtype, + test_marks=[ + TestMark( + ("TestKernels", "test_scripted_vs_eager"), + pytest.mark.filterwarnings(f"ignore:{re.escape('operator() profile_node %41')}:UserWarning"), + ), + TestMark( + ("TestKernels", "test_dtype_and_device_consistency"), + pytest.mark.skip(reason="`convert_dtype_*` kernels convert the dtype by design"), + condition=lambda args_kwargs: args_kwargs.args[0].dtype + != args_kwargs.kwargs.get("dtype", torch.float32), + ), + TestMark( + ("TestKernels", "test_against_reference"), + pytest.mark.xfail(reason="Conversion overflows"), + condition=lambda args_kwargs: ( + args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16} + and not args_kwargs.kwargs["dtype"].is_floating_point + ) + or ( + args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16} + and args_kwargs.kwargs["dtype"] == torch.int64 + ) + or ( + args_kwargs.args[0].dtype in {torch.int32, torch.int64} + and args_kwargs.kwargs["dtype"] == torch.float16 + ), + ), + ], + ), + ] +) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index bafe1f134..3423006e2 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -26,6 +26,20 @@ def script(fn): raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error +def make_info_args_kwargs_params(info, *, args_kwargs_fn, test_id=None): + args_kwargs = list(args_kwargs_fn(info)) + idx_field_len = len(str(len(args_kwargs))) + return [ + pytest.param( + info, + args_kwargs_, + 
marks=info.get_marks(test_id, args_kwargs_) if test_id else [], + id=f"{info.id}-{idx:0{idx_field_len}}", + ) + for idx, args_kwargs_ in enumerate(args_kwargs) + ] + + def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=None): if condition is None: @@ -49,18 +63,7 @@ def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn, condition=No if not condition(info): continue - args_kwargs = list(args_kwargs_fn(info)) - idx_field_len = len(str(len(args_kwargs))) - - for idx, args_kwargs_ in enumerate(args_kwargs): - argvalues.append( - pytest.param( - info, - args_kwargs_, - marks=info.get_marks(test_id, args_kwargs_), - id=f"{info.id}-{idx:0{idx_field_len}}", - ) - ) + argvalues.extend(make_info_args_kwargs_params(info, args_kwargs_fn=args_kwargs_fn, test_id=test_id)) return pytest.mark.parametrize(argnames, argvalues)(test_fn) @@ -232,7 +235,6 @@ class TestDispatchers: [ F.clamp_bounding_box, F.convert_color_space, - F.convert_image_dtype, F.get_dimensions, F.get_image_num_channels, F.get_image_size, @@ -312,6 +314,24 @@ def test_alias(alias, target): assert alias is target +@pytest.mark.parametrize( + ("info", "args_kwargs"), + make_info_args_kwargs_params( + next(info for info in KERNEL_INFOS if info.kernel is F.convert_image_dtype), + args_kwargs_fn=lambda info: info.sample_inputs_fn(), + ), +) +@pytest.mark.parametrize("device", cpu_and_gpu()) +def test_dtype_and_device_convert_image_dtype(info, args_kwargs, device): + (input, *other_args), kwargs = args_kwargs.load(device) + dtype = other_args[0] if other_args else kwargs.get("dtype", torch.float32) + + output = info.kernel(input, dtype) + + assert output.dtype == dtype + assert output.device == input.device + + # TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in # `prototype_transforms_kernel_infos.py` diff --git a/torchvision/prototype/transforms/functional/_type_conversion.py b/torchvision/prototype/transforms/functional/_type_conversion.py index 5fe990eb7..a57fbc655 100644 --- a/torchvision/prototype/transforms/functional/_type_conversion.py +++ b/torchvision/prototype/transforms/functional/_type_conversion.py @@ -7,7 +7,7 @@ import torch from torchvision.io.video import read_video from torchvision.prototype import features from torchvision.prototype.utils._internal import ReadOnlyTensorBuffer -from torchvision.transforms import functional as _F +from torchvision.transforms import functional as _F, functional_tensor as _FT @torch.jit.unused @@ -42,4 +42,77 @@ pil_to_tensor = _F.pil_to_tensor # prevalent and well understood. Thus, we just alias it without deprecating the old name. 
to_pil_image = to_image_pil -convert_image_dtype = _F.convert_image_dtype + +def _num_value_bits(dtype: torch.dtype) -> int: + if dtype == torch.uint8: + return 8 + elif dtype == torch.int8: + return 7 + elif dtype == torch.int16: + return 15 + elif dtype == torch.int32: + return 31 + elif dtype == torch.int64: + return 63 + else: + raise TypeError(f"Number of value bits is only defined for integer dtypes, but got {dtype}.") + + +def convert_image_dtype(image: torch.Tensor, dtype: torch.dtype = torch.float) -> torch.Tensor: + if not isinstance(image, torch.Tensor): + raise TypeError("Input img should be Tensor Image") + + if image.dtype == dtype: + return image + + float_input = image.is_floating_point() + if torch.jit.is_scripting(): + # TODO: remove this branch as soon as `dtype.is_floating_point` is supported by JIT + float_output = torch.tensor(0, dtype=dtype).is_floating_point() + else: + float_output = dtype.is_floating_point + + if float_input: + # float to float + if float_output: + return image.to(dtype) + + # float to int + if (image.dtype == torch.float32 and dtype in (torch.int32, torch.int64)) or ( + image.dtype == torch.float64 and dtype == torch.int64 + ): + raise RuntimeError(f"The conversion from {image.dtype} to {dtype} cannot be performed safely.") + + # For data in the range `[0.0, 1.0]`, just multiplying by the maximum value of the integer range and converting + # to the integer dtype is not sufficient. For example, `torch.rand(...).mul(255).to(torch.uint8)` will only + # be `255` if the input is exactly `1.0`. See https://github.com/pytorch/vision/pull/2078#issuecomment-612045321 + # for a detailed analysis. + # To mitigate this, we could round before we convert to the integer dtype, but this is an extra operation. + # Instead, we can also multiply by the maximum value plus something close to `1`. See + # https://github.com/pytorch/vision/pull/2078#issuecomment-613524965 for details. + eps = 1e-3 + max_value = float(_FT._max_value(dtype)) + # We need to scale first since the conversion would otherwise turn the input range `[0.0, 1.0]` into the + # discrete set `{0, 1}`. + return image.mul(max_value + 1.0 - eps).to(dtype) + else: + # int to float + if float_output: + return image.to(dtype).div_(_FT._max_value(image.dtype)) + + # int to int + num_value_bits_input = _num_value_bits(image.dtype) + num_value_bits_output = _num_value_bits(dtype) + + if num_value_bits_input > num_value_bits_output: + return image.bitwise_right_shift(num_value_bits_input - num_value_bits_output).to(dtype) + else: + # The bitshift kernel is not vectorized + # https://github.com/pytorch/pytorch/blob/703c19008df4700b6a522b0ae5c4b6d5ffc0906f/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L315-L322 + # This results in the multiplication actually being faster. 
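
One readability note on this branch: max_value_input is read from the target dtype and max_value_output from the image's own dtype, so the names are swapped relative to what they hold, although the resulting factor is correct. A quick sanity check of the integer-to-integer arithmetic, for a uint8 -> int16 -> uint8 round trip (the values are illustrative only):

    import torch

    u8 = torch.tensor([0, 128, 255], dtype=torch.uint8)

    # upscale uint8 -> int16: factor = (32767 + 1) // (255 + 1) = 128
    factor = (torch.iinfo(torch.int16).max + 1) // (torch.iinfo(torch.uint8).max + 1)
    up = u8.to(torch.int16) * factor
    print(up)        # [0, 16384, 32640]

    # downscale int16 -> uint8: shift right by 15 - 8 = 7 value bits
    down = up.bitwise_right_shift(7).to(torch.uint8)
    print(down)      # [0, 128, 255]
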
+ # TODO: If the bitshift kernel is optimized in core, replace the computation below with + # `image.to(dtype).bitwise_left_shift_(num_value_bits_output - num_value_bits_input)` + max_value_input = float(_FT._max_value(dtype)) + max_value_output = float(_FT._max_value(image.dtype)) + factor = int((max_value_input + 1) // (max_value_output + 1)) + return image.to(dtype).mul_(factor) -- GitLab From 246de0772c80eae435c0a562268d8d21ed7a27a2 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Thu, 20 Oct 2022 14:01:32 -0400 Subject: [PATCH 061/624] [Nova] Migrate Linux CPU job to Generic Job (#6797) * [Nova] Migrate Linux CPU job to Generic Job * branch ref for composite action job * move checkout step to separate job * added runs-on * nit fixes * no need to run conda sheel script thing * Channel is set inside the script * add remaining env vars * nit env var fix * cleanup * simplify unneeded jobs * name of the conda env should be the path * remove main ref to use PR --- .github/workflows/test-linux-cpu.yml | 85 ++++++++++++---------------- 1 file changed, 36 insertions(+), 49 deletions(-) diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index b6891bbfb..f78dd323d 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -14,57 +14,44 @@ env: jobs: tests: - name: "Unit-tests on Linux CPU" - runs-on: [self-hosted, linux.12xlarge] - container: - image: pytorch/conda-builder:cpu strategy: matrix: py_vers: ["3.7", "3.8", "3.9", "3.10"] fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.12xlarge + repository: pytorch/vision + script: | + # Mark Build Directory Safe + git config --global --add safe.directory /__w/vision/vision - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Setup Conda - shell: bash -l {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - git config --global --add safe.directory /__w/vision/vision - . ~/miniconda3/etc/profile.d/conda.sh - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy - echo "CONDA_RUN=conda run -p ${ENV_NAME}" >> "$GITHUB_ENV" - - name: Install TorchVision - shell: bash -l {0} - env: - VERSION: cpu - CUDATOOLKIT: cpuonly - run: | - # Needed for JPEG library detection as setup.py detects conda presence - # by running `shutil.which('conda')` - export PATH=~/miniconda3/bin:$PATH - set -ex - ${CONDA_RUN} conda install \ - --yes \ - -c "pytorch-${CHANNEL}" \ - -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ - "${CUDATOOLKIT}" - ${CONDA_RUN} python3 setup.py develop - ${CONDA_RUN} python3 -m pip install pytest pytest-mock 'av<10' - - name: Run tests - shell: bash -l {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . 
~/miniconda3/etc/profile.d/conda.sh - set -ex - ${CONDA_RUN} python3 -m torch.utils.collect_env - ${CONDA_RUN} python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 - conda env remove -p ${ENV_NAME} + # Set up Environment Variables + export PYTHON_VERSION="${{ matrix.py_vers }}" + export VERSION="cpu" + export CUDATOOLKIT="cpuonly" + + # Set CHANNEL + if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + export CHANNEL=test + else + export CHANNEL=nightly + fi + + # Create Conda Env + conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy + conda activate /work/ci_env + + # Install PyTorch, Torchvision, and testing libraries + set -ex + conda install \ + --yes \ + -c "pytorch-${CHANNEL}" \ + -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ + "${CUDATOOLKIT}" + python3 setup.py develop + python3 -m pip install pytest pytest-mock 'av<10' + + # Run Tests + python3 -m torch.utils.collect_env + python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 -- GitLab From 06ad05fa60f8af0ba36b726c8e0233040811a588 Mon Sep 17 00:00:00 2001 From: Joao Gomes Date: Fri, 21 Oct 2022 09:58:13 +0100 Subject: [PATCH 062/624] Read video from memory newapi (#6771) * add tensor as optional param * add init from memory * fix bug * fix bug * first working version * apply formatting and add tests * simplify tests * fix tests * fix wrong variable name * add path as optional parameter * add src as optional * address pr comments * Fix warning messages * address pr comments * make tests stricter * Revert "make tests stricter" This reverts commit 6c92e94e8372f381c9496c9f885c2c71b6a4356b. --- test/test_videoapi.py | 41 +++++++++++++++++++++ torchvision/csrc/io/decoder/defs.h | 2 +- torchvision/csrc/io/video/video.cpp | 57 +++++++++++++++++++++++------ torchvision/csrc/io/video/video.h | 20 +++++++++- torchvision/io/video_reader.py | 47 +++++++++++++++++++++--- 5 files changed, 148 insertions(+), 19 deletions(-) diff --git a/test/test_videoapi.py b/test/test_videoapi.py index 895b9b835..4688e5a64 100644 --- a/test/test_videoapi.py +++ b/test/test_videoapi.py @@ -77,6 +77,7 @@ class TestVideoApi: # compare the frames and ptss for i in range(len(vr_frames)): assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) + mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) # on average the difference is very small and caused # by decoding (around 1%) @@ -114,6 +115,46 @@ class TestVideoApi: # we assure that there is never more than 1% difference in signal assert max_delta.item() < 0.001 + @pytest.mark.parametrize("stream", ["video", "audio"]) + @pytest.mark.parametrize("test_video", test_videos.keys()) + def test_frame_reading_mem_vs_file(self, test_video, stream): + full_path = os.path.join(VIDEO_DIR, test_video) + + # Test video reading from file vs from memory + vr_frames, vr_frames_mem = [], [] + vr_pts, vr_pts_mem = [], [] + # get vr frames + video_reader = VideoReader(full_path, stream) + for vr_frame in video_reader: + vr_frames.append(vr_frame["data"]) + vr_pts.append(vr_frame["pts"]) + + # get vr frames = read from memory + f = open(full_path, "rb") + fbytes = f.read() + f.close() + video_reader_from_mem = VideoReader(fbytes, stream) + + for vr_frame_from_mem in video_reader_from_mem: + vr_frames_mem.append(vr_frame_from_mem["data"]) + vr_pts_mem.append(vr_frame_from_mem["pts"]) + + # same number of frames + assert len(vr_frames) == len(vr_frames_mem) + 
assert len(vr_pts) == len(vr_pts_mem) + + # compare the frames and ptss + for i in range(len(vr_frames)): + assert vr_pts[i] == vr_pts_mem[i] + mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) + # on average the difference is very small and caused + # by decoding (around 1%) + # TODO: asses empirically how to set this? atm it's 1% + # averaged over all frames + assert mean_delta.item() < 2.55 + + del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem + @pytest.mark.parametrize("test_video,config", test_videos.items()) def test_metadata(self, test_video, config): """ diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h index dac6293d3..502e5762e 100644 --- a/torchvision/csrc/io/decoder/defs.h +++ b/torchvision/csrc/io/decoder/defs.h @@ -165,7 +165,7 @@ struct MediaFormat { struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize - std::string uri; + std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp index 38b350145..d8b36a35a 100644 --- a/torchvision/csrc/io/video/video.cpp +++ b/torchvision/csrc/io/video/video.cpp @@ -156,14 +156,34 @@ void Video::_getDecoderParams( } // _get decoder params -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); +void Video::initFromFile( + std::string videoPath, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + params.uri = videoPath; + _init(stream, numThreads); +} + +void Video::initFromMemory( + torch::Tensor videoTensor, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + callback = MemoryBuffer::getCallback( + videoTensor.data_ptr(), videoTensor.size(0)); + _init(stream, numThreads); +} + +void Video::_init(std::string stream, int64_t numThreads) { // set number of threads global numThreads_ = numThreads; // parse stream information current_stream = _parseStream(stream); // note that in the initial call we want to get all streams - Video::_getDecoderParams( + _getDecoderParams( 0, // video start 0, // headerOnly std::get<0>(current_stream), // stream info - remove that @@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { std::string logMessage, logType; - // TODO: add read from memory option - params.uri = videoPath; - logType = "file"; - logMessage = videoPath; - // locals std::vector audioFPS, videoFPS; std::vector audioDuration, videoDuration, ccDuration, subsDuration; @@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { c10::Dict> subsMetadata; // callback and metadata defined in struct - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); if (succeeded) { for (const auto& header : metadata) { double fps = double(header.fps); @@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("cc", ccMetadata); - succeeded = Video::setCurrentStream(stream); + 
succeeded = setCurrentStream(stream); LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; if (std::get<1>(current_stream) != -1) { LOG(INFO) << "Stream index set to " << std::get<1>(current_stream) << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; } +} + +Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { + C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); + if (!videoPath.empty()) { + initFromFile(videoPath, stream, numThreads); + } } // video bool Video::setCurrentStream(std::string stream = "video") { + TORCH_CHECK(initialized, "Video object has to be initialized first"); if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { current_stream = _parseStream(stream); } @@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ); // callback and metadata defined in Video.h - return (decoder.init(params, std::move(callback), &metadata)); + DecoderInCallback tmp_callback = callback; + return (decoder.init(params, std::move(tmp_callback), &metadata)); } std::tuple Video::getCurrentStream() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return current_stream; } c10::Dict>> Video:: getStreamMetadata() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return streamsMetadata; } void Video::Seek(double ts, bool fastSeek = false) { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // initialize the class variables used for seeking and retrurn _getDecoderParams( ts, // video start @@ -282,11 +310,14 @@ void Video::Seek(double ts, bool fastSeek = false) { ); // callback and metadata defined in Video.h - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); + LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; } std::tuple Video::Next() { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // if failing to decode simply return a null tensor (note, should we // raise an exeption?) double frame_pts_s; @@ -345,6 +376,8 @@ std::tuple Video::Next() { static auto registerVideo = torch::class_
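
On the Python side, the from-memory path added above is exercised by the new test_frame_reading_mem_vs_file test earlier in this patch. A minimal usage sketch mirroring that test, assuming the usual torchvision.io.VideoReader entry point and a torchvision build with the video API enabled (the file name is a placeholder):

    import torch
    from torchvision.io import VideoReader

    with open("example.mp4", "rb") as f:           # placeholder path
        raw = f.read()                             # raw container bytes

    reader = VideoReader(raw, "video")             # decode from memory instead of a file path
    frames = [frame["data"] for frame in reader]   # each item is {"data": Tensor, "pts": float}
    print(torch.stack(frames).shape)               # (T, C, H, W)
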