Unverified commit 7f7ff056 authored by Vasilis Vryniotis, committed by GitHub

Fix flakiness on detection tests (#2966)

* Simplify the ACCEPT=True logic in assertExpected().

* Separate the expected filename estimation from assertExpected().

* Unflatten expected values.

* Assert for duplicate scores if the primary check fails.

* Remove custom exceptions for algorithms and add a compact function for shrinking large outputs.

* Remove unused variables.

* Add warning and comments.

* Re-enable all autocast unit-tests for detection and mark the tests as skipped on partial validation.

* Move the test skip to the end.

* Change the warning message.
parent 47382673
@@ -105,16 +105,14 @@ class TestCase(unittest.TestCase):
                                      "expect",
                                      munged_id)
-        subname_output = ""
         if subname:
             expected_file += "_" + subname
-            subname_output = " ({})".format(subname)
         expected_file += "_expect.pkl"
 
         if not ACCEPT and not os.path.exists(expected_file):
             raise RuntimeError(
-                ("No expect file exists for {}{}; to accept the current output, run:\n"
-                 "python {} {} --accept").format(munged_id, subname_output, __main__.__file__, munged_id))
+                ("No expect file exists for {}; to accept the current output, run:\n"
+                 "python {} {} --accept").format(os.path.basename(expected_file), __main__.__file__, munged_id))
 
         return expected_file
@@ -139,11 +137,13 @@ class TestCase(unittest.TestCase):
         expected_file = self._get_expected_file(subname, strip_suffix)
 
         if ACCEPT:
-            print("Accepting updated output for {}:\n\n{}".format(os.path.basename(expected_file), output))
+            filename = {os.path.basename(expected_file)}
+            print("Accepting updated output for {}:\n\n{}".format(filename, output))
             torch.save(output, expected_file)
             MAX_PICKLE_SIZE = 50 * 1000  # 50 KB
             binary_size = os.path.getsize(expected_file)
-            self.assertTrue(binary_size <= MAX_PICKLE_SIZE)
+            if binary_size > MAX_PICKLE_SIZE:
+                raise RuntimeError("The output for {}, is larger than 50kb".format(filename))
         else:
             expected = torch.load(expected_file)
             self.assertEqual(output, expected, prec=prec)
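For reference, a minimal standalone sketch of the accept-versus-compare flow that the refactored assertExpected() follows. The 50 KB cap matches MAX_PICKLE_SIZE in the diff; the ACCEPT toggle, the helper name check_against_expected, and the assert_equal callback are simplified stand-ins, not part of the commit:

    import os
    import torch

    # Assumption: the real test suite toggles ACCEPT via a --accept command-line flag.
    ACCEPT = False
    MAX_PICKLE_SIZE = 50 * 1000  # 50 KB

    def check_against_expected(output, expected_file, assert_equal):
        """Simplified sketch of the accept-versus-compare flow."""
        if ACCEPT:
            # Accept mode: store the new expected output and refuse oversized pickles.
            torch.save(output, expected_file)
            if os.path.getsize(expected_file) > MAX_PICKLE_SIZE:
                raise RuntimeError("The output for {} is larger than 50 KB".format(
                    os.path.basename(expected_file)))
        else:
            # Compare mode: load the stored expectation and assert equality.
            expected = torch.load(expected_file)
            assert_equal(output, expected)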
4 files suppressed by a .gitattributes entry or because the files' encoding is unsupported.
 from common_utils import TestCase, map_nested_tensor_object, freeze_rng_state
 from collections import OrderedDict
 from itertools import product
+import functools
+import operator
 import torch
 import torch.nn as nn
 import numpy as np
 from torchvision import models
 import unittest
 import random
+import warnings
 
-from torchvision.models.detection._utils import overwrite_eps
 
 
 def set_rng_seed(seed):
@@ -88,14 +89,10 @@ script_test_models = {
 # trying autocast. However, they still try an autocasted forward pass, so they still ensure
 # autocast coverage suffices to prevent dtype errors in each model.
 autocast_flaky_numerics = (
-    "fasterrcnn_resnet50_fpn",
     "inception_v3",
-    "keypointrcnn_resnet50_fpn",
-    "maskrcnn_resnet50_fpn",
     "resnet101",
     "resnet152",
     "wide_resnet101_2",
-    "retinanet_resnet50_fpn",
 )
@@ -148,10 +145,9 @@ class ModelTester(TestCase):
         set_rng_seed(0)
         kwargs = {}
         if "retinanet" in name:
-            kwargs["score_thresh"] = 0.013
+            # Reduce the default threshold to ensure the returned boxes are not empty.
+            kwargs["score_thresh"] = 0.01
         model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False, **kwargs)
-        if "keypointrcnn" in name or "retinanet" in name:
-            overwrite_eps(model, 0.0)
         model.eval().to(device=dev)
         input_shape = (3, 300, 300)
         # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests
@@ -163,15 +159,22 @@ class ModelTester(TestCase):
         def check_out(out):
             self.assertEqual(len(out), 1)
 
+            def compact(tensor):
+                size = tensor.size()
+                elements_per_sample = functools.reduce(operator.mul, size[1:], 1)
+                if elements_per_sample > 30:
+                    return compute_mean_std(tensor)
+                else:
+                    return subsample_tensor(tensor)
+
             def subsample_tensor(tensor):
-                num_elems = tensor.numel()
+                num_elems = tensor.size(0)
                 num_samples = 20
                 if num_elems <= num_samples:
                     return tensor
 
-                flat_tensor = tensor.flatten()
                 ith_index = num_elems // num_samples
-                return flat_tensor[ith_index - 1::ith_index]
+                return tensor[ith_index - 1::ith_index]
 
             def compute_mean_std(tensor):
                 # can't compute mean of integral tensor
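To illustrate why outputs are shrunk before being pickled, the standalone sketch below mirrors the compact()/subsample_tensor()/compute_mean_std() helpers added above. The >30 elements-per-sample threshold and the 20-sample subsampling come from the diff; the dummy shapes and the double-cast inside compute_mean_std are illustrative assumptions. Large per-sample tensors such as masks collapse to a mean/std pair, while small ones such as boxes are merely subsampled, which keeps the expected .pkl file well under the 50 KB cap:

    import functools
    import operator
    import torch

    def compute_mean_std(tensor):
        tensor = tensor.to(torch.double)  # can't compute the mean of an integral tensor
        return {"mean": torch.mean(tensor), "std": torch.std(tensor)}

    def subsample_tensor(tensor, num_samples=20):
        num_elems = tensor.size(0)
        if num_elems <= num_samples:
            return tensor
        ith_index = num_elems // num_samples
        return tensor[ith_index - 1::ith_index]

    def compact(tensor):
        # Summarize tensors with many elements per sample, subsample the rest.
        elements_per_sample = functools.reduce(operator.mul, tensor.size()[1:], 1)
        return compute_mean_std(tensor) if elements_per_sample > 30 else subsample_tensor(tensor)

    boxes = torch.rand(100, 4)            # 4 elements per sample -> subsampled
    masks = torch.rand(100, 1, 300, 300)  # 90000 elements per sample -> mean/std
    print(compact(boxes).shape)           # torch.Size([20, 4])
    print(compact(masks))                 # {'mean': tensor(...), 'std': tensor(...)}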
@@ -180,18 +183,32 @@ class ModelTester(TestCase):
                 std = torch.std(tensor)
                 return {"mean": mean, "std": std}
 
-            if name == "maskrcnn_resnet50_fpn":
-                # maskrcnn_resnet_50_fpn numerically unstable across platforms, so for now
-                # compare results with mean and std
-                test_value = map_nested_tensor_object(out, tensor_map_fn=compute_mean_std)
-                # mean values are small, use large prec
-                self.assertExpected(test_value, prec=.01, strip_suffix="_" + dev)
-            else:
-                self.assertExpected(map_nested_tensor_object(out, tensor_map_fn=subsample_tensor),
-                                    prec=0.01,
-                                    strip_suffix="_" + dev)
-
-        check_out(out)
+            output = map_nested_tensor_object(out, tensor_map_fn=compact)
+            prec = 0.01
+            strip_suffix = "_" + dev
+            try:
+                # We first try to assert the entire output if possible. This is not
+                # only the best way to assert results but also handles the cases
+                # where we need to create a new expected result.
+                self.assertExpected(output, prec=prec, strip_suffix=strip_suffix)
+            except AssertionError:
+                # Unfortunately detection models are flaky due to the unstable sort
+                # in NMS. If matching across all outputs fails, use the same approach
+                # as in NMSTester.test_nms_cuda to see if this is caused by duplicate
+                # scores.
+                expected_file = self._get_expected_file(strip_suffix=strip_suffix)
+                expected = torch.load(expected_file)
+                self.assertEqual(output[0]["scores"], expected[0]["scores"], prec=prec)
+
+                # Note: Fmassa proposed turning off NMS by adapting the threshold
+                # and then using the Hungarian algorithm as in DETR to find the
+                # best match between output and expected boxes and eliminate some
+                # of the flakiness. Worth exploring.
+                return False  # Partial validation performed
+
+            return True  # Full validation performed
+
+        full_validation = check_out(out)
 
         scripted_model = torch.jit.script(model)
         scripted_model.eval()
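The duplicate-scores fallback above addresses the fact that NMS may return equally scored boxes in a different order between runs or devices, so an element-wise comparison of the boxes can fail even though the detections match. A small self-contained illustration with made-up tensors (not actual model output):

    import torch

    # Two runs that found the same three boxes, but the two boxes tied at 0.9
    # came back in a different order after NMS.
    scores_run1 = torch.tensor([0.9, 0.9, 0.3])
    boxes_run1 = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.], [20., 20., 30., 30.]])

    scores_run2 = torch.tensor([0.9, 0.9, 0.3])
    boxes_run2 = torch.tensor([[5., 5., 15., 15.], [0., 0., 10., 10.], [20., 20., 30., 30.]])

    print(torch.equal(boxes_run1, boxes_run2))    # False: element-wise box check is flaky
    print(torch.equal(scores_run1, scores_run2))  # True: the scores themselves still match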
@@ -200,9 +217,6 @@ class ModelTester(TestCase):
         self.assertEqual(scripted_out[0]["scores"], out[0]["scores"])
         # labels currently float in script: need to investigate (though same result)
         self.assertEqual(scripted_out[0]["labels"].to(dtype=torch.long), out[0]["labels"])
-        self.assertTrue("boxes" in out[0])
-        self.assertTrue("scores" in out[0])
-        self.assertTrue("labels" in out[0])
         # don't check script because we are compiling it here:
         # TODO: refactor tests
         # self.check_script(model, name)
@@ -213,7 +227,15 @@ class ModelTester(TestCase):
                 out = model(model_input)
                 # See autocast_flaky_numerics comment at top of file.
                 if name not in autocast_flaky_numerics:
-                    check_out(out)
+                    full_validation &= check_out(out)
+
+        if not full_validation:
+            msg = "The output of {} could only be partially validated. " \
+                  "This is likely due to unit-test flakiness, but you may " \
+                  "want to do additional manual checks if you made " \
+                  "significant changes to the codebase.".format(self._testMethodName)
+            warnings.warn(msg, RuntimeWarning)
+            raise unittest.SkipTest(msg)
 
     def _test_detection_model_validation(self, name):
         set_rng_seed(0)
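The partial-validation handling added at the end of the test follows a general warn-then-skip pattern: emit a warning so the reduced coverage is visible in the logs, then raise unittest.SkipTest so the test is reported as skipped rather than as a clean pass. A minimal, hypothetical example of that pattern (not taken from the commit):

    import unittest
    import warnings

    class PartialValidationExample(unittest.TestCase):
        def test_something_flaky(self):
            full_validation = self._run_checks()
            if not full_validation:
                msg = "The output of {} could only be partially validated.".format(self._testMethodName)
                warnings.warn(msg, RuntimeWarning)
                raise unittest.SkipTest(msg)  # reported as skipped, not as a silent pass

        def _run_checks(self):
            # Placeholder: pretend the strict comparison failed and only a weaker check passed.
            return False

    if __name__ == "__main__":
        unittest.main()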