Unverified commit 7f7ff056 authored by Vasilis Vryniotis, committed by GitHub

Fix flakiness on detection tests (#2966)

* Simplify the ACCEPT=True logic in assertExpected().

* Separate the expected filename estimation from assertExpected().

* Unflatten expected values.

* Assert for duplicate scores if the primary check fails.

* Remove custom exceptions for algorithms and add a compact function for shrinking large outputs.

* Remove unused variables.

* Add warning and comments.

* Re-enable all autocast unit-tests for detection and mark the tests as skipped on partial validation.

* Move the test skip to the end.

* Change the warning message.
parent 47382673
@@ -105,16 +105,14 @@ class TestCase(unittest.TestCase):
                                      "expect",
                                      munged_id)
-        subname_output = ""
         if subname:
             expected_file += "_" + subname
-            subname_output = " ({})".format(subname)
         expected_file += "_expect.pkl"
 
         if not ACCEPT and not os.path.exists(expected_file):
             raise RuntimeError(
-                ("No expect file exists for {}{}; to accept the current output, run:\n"
-                 "python {} {} --accept").format(munged_id, subname_output, __main__.__file__, munged_id))
+                ("No expect file exists for {}; to accept the current output, run:\n"
+                 "python {} {} --accept").format(os.path.basename(expected_file), __main__.__file__, munged_id))
 
         return expected_file
@@ -139,11 +137,13 @@ class TestCase(unittest.TestCase):
         expected_file = self._get_expected_file(subname, strip_suffix)
 
         if ACCEPT:
-            print("Accepting updated output for {}:\n\n{}".format(os.path.basename(expected_file), output))
+            filename = {os.path.basename(expected_file)}
+            print("Accepting updated output for {}:\n\n{}".format(filename, output))
             torch.save(output, expected_file)
             MAX_PICKLE_SIZE = 50 * 1000  # 50 KB
             binary_size = os.path.getsize(expected_file)
-            self.assertTrue(binary_size <= MAX_PICKLE_SIZE)
+            if binary_size > MAX_PICKLE_SIZE:
+                raise RuntimeError("The output for {}, is larger than 50kb".format(filename))
         else:
             expected = torch.load(expected_file)
             self.assertEqual(output, expected, prec=prec)
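For reference, a minimal standalone sketch of the accept-versus-compare flow that the refactored assertExpected() follows. The 50 KB cap matches MAX_PICKLE_SIZE in the diff; the ACCEPT toggle, the helper name check_against_expected, and the assert_equal callback are simplified stand-ins, not part of the commit:

    import os
    import torch

    # Assumption: the real test suite toggles ACCEPT via a --accept command-line flag.
    ACCEPT = False
    MAX_PICKLE_SIZE = 50 * 1000  # 50 KB

    def check_against_expected(output, expected_file, assert_equal):
        """Simplified sketch of the accept-versus-compare flow."""
        if ACCEPT:
            # Accept mode: store the new expected output and refuse oversized pickles.
            torch.save(output, expected_file)
            if os.path.getsize(expected_file) > MAX_PICKLE_SIZE:
                raise RuntimeError("The output for {} is larger than 50 KB".format(
                    os.path.basename(expected_file)))
        else:
            # Compare mode: load the stored expectation and assert equality.
            expected = torch.load(expected_file)
            assert_equal(output, expected)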
4 files suppressed by a .gitattributes entry or because the files' encoding is unsupported.
 from common_utils import TestCase, map_nested_tensor_object, freeze_rng_state
 from collections import OrderedDict
 from itertools import product
+import functools
+import operator
 import torch
 import torch.nn as nn
 import numpy as np
 from torchvision import models
 import unittest
 import random
+import warnings
 
-from torchvision.models.detection._utils import overwrite_eps
 
 
 def set_rng_seed(seed):
@@ -88,14 +89,10 @@ script_test_models = {
 # trying autocast. However, they still try an autocasted forward pass, so they still ensure
 # autocast coverage suffices to prevent dtype errors in each model.
 autocast_flaky_numerics = (
-    "fasterrcnn_resnet50_fpn",
     "inception_v3",
-    "keypointrcnn_resnet50_fpn",
-    "maskrcnn_resnet50_fpn",
     "resnet101",
     "resnet152",
     "wide_resnet101_2",
-    "retinanet_resnet50_fpn",
 )
@@ -148,10 +145,9 @@ class ModelTester(TestCase):
         set_rng_seed(0)
         kwargs = {}
         if "retinanet" in name:
-            kwargs["score_thresh"] = 0.013
+            # Reduce the default threshold to ensure the returned boxes are not empty.
+            kwargs["score_thresh"] = 0.01
         model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False, **kwargs)
-        if "keypointrcnn" in name or "retinanet" in name:
-            overwrite_eps(model, 0.0)
         model.eval().to(device=dev)
         input_shape = (3, 300, 300)
         # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests
@@ -163,15 +159,22 @@ class ModelTester(TestCase):
         def check_out(out):
             self.assertEqual(len(out), 1)
 
+            def compact(tensor):
+                size = tensor.size()
+                elements_per_sample = functools.reduce(operator.mul, size[1:], 1)
+                if elements_per_sample > 30:
+                    return compute_mean_std(tensor)
+                else:
+                    return subsample_tensor(tensor)
+
             def subsample_tensor(tensor):
-                num_elems = tensor.numel()
+                num_elems = tensor.size(0)
                 num_samples = 20
                 if num_elems <= num_samples:
                     return tensor
 
-                flat_tensor = tensor.flatten()
                 ith_index = num_elems // num_samples
-                return flat_tensor[ith_index - 1::ith_index]
+                return tensor[ith_index - 1::ith_index]
 
             def compute_mean_std(tensor):
                 # can't compute mean of integral tensor
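To illustrate why outputs are shrunk before being pickled, the standalone sketch below mirrors the compact()/subsample_tensor()/compute_mean_std() helpers added above. The >30 elements-per-sample threshold and the 20-sample subsampling come from the diff; the dummy shapes and the double-cast inside compute_mean_std are illustrative assumptions. Large per-sample tensors such as masks collapse to a mean/std pair, while small ones such as boxes are merely subsampled, which keeps the expected .pkl file well under the 50 KB cap:

    import functools
    import operator
    import torch

    def compute_mean_std(tensor):
        tensor = tensor.to(torch.double)  # can't compute the mean of an integral tensor
        return {"mean": torch.mean(tensor), "std": torch.std(tensor)}

    def subsample_tensor(tensor, num_samples=20):
        num_elems = tensor.size(0)
        if num_elems <= num_samples:
            return tensor
        ith_index = num_elems // num_samples
        return tensor[ith_index - 1::ith_index]

    def compact(tensor):
        # Summarize tensors with many elements per sample, subsample the rest.
        elements_per_sample = functools.reduce(operator.mul, tensor.size()[1:], 1)
        return compute_mean_std(tensor) if elements_per_sample > 30 else subsample_tensor(tensor)

    boxes = torch.rand(100, 4)            # 4 elements per sample -> subsampled
    masks = torch.rand(100, 1, 300, 300)  # 90000 elements per sample -> mean/std
    print(compact(boxes).shape)           # torch.Size([20, 4])
    print(compact(masks))                 # {'mean': tensor(...), 'std': tensor(...)}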
@@ -180,18 +183,32 @@ class ModelTester(TestCase):
                 std = torch.std(tensor)
                 return {"mean": mean, "std": std}
 
-            if name == "maskrcnn_resnet50_fpn":
-                # maskrcnn_resnet_50_fpn numerically unstable across platforms, so for now
-                # compare results with mean and std
-                test_value = map_nested_tensor_object(out, tensor_map_fn=compute_mean_std)
-                # mean values are small, use large prec
-                self.assertExpected(test_value, prec=.01, strip_suffix="_" + dev)
-            else:
-                self.assertExpected(map_nested_tensor_object(out, tensor_map_fn=subsample_tensor),
-                                    prec=0.01,
-                                    strip_suffix="_" + dev)
-
-        check_out(out)
+            output = map_nested_tensor_object(out, tensor_map_fn=compact)
+            prec = 0.01
+            strip_suffix = "_" + dev
+            try:
+                # We first try to assert the entire output if possible. This is not
+                # only the best way to assert results but also handles the cases
+                # where we need to create a new expected result.
+                self.assertExpected(output, prec=prec, strip_suffix=strip_suffix)
+            except AssertionError:
+                # Unfortunately detection models are flaky due to the unstable sort
+                # in NMS. If matching across all outputs fails, use the same approach
+                # as in NMSTester.test_nms_cuda to see if this is caused by duplicate
+                # scores.
+                expected_file = self._get_expected_file(strip_suffix=strip_suffix)
+                expected = torch.load(expected_file)
+                self.assertEqual(output[0]["scores"], expected[0]["scores"], prec=prec)
+
+                # Note: Fmassa proposed turning off NMS by adapting the threshold
+                # and then using the Hungarian algorithm as in DETR to find the
+                # best match between output and expected boxes and eliminate some
+                # of the flakiness. Worth exploring.
+                return False  # Partial validation performed
+
+            return True  # Full validation performed
+
+        full_validation = check_out(out)
 
         scripted_model = torch.jit.script(model)
         scripted_model.eval()
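The duplicate-scores fallback above addresses the fact that NMS may return equally scored boxes in a different order between runs or devices, so an element-wise comparison of the boxes can fail even though the detections match. A small self-contained illustration with made-up tensors (not actual model output):

    import torch

    # Two runs that found the same three boxes, but the two boxes tied at 0.9
    # came back in a different order after NMS.
    scores_run1 = torch.tensor([0.9, 0.9, 0.3])
    boxes_run1 = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.], [20., 20., 30., 30.]])

    scores_run2 = torch.tensor([0.9, 0.9, 0.3])
    boxes_run2 = torch.tensor([[5., 5., 15., 15.], [0., 0., 10., 10.], [20., 20., 30., 30.]])

    print(torch.equal(boxes_run1, boxes_run2))    # False: element-wise box check is flaky
    print(torch.equal(scores_run1, scores_run2))  # True: the scores themselves still match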
@@ -200,9 +217,6 @@ class ModelTester(TestCase):
         self.assertEqual(scripted_out[0]["scores"], out[0]["scores"])
         # labels currently float in script: need to investigate (though same result)
         self.assertEqual(scripted_out[0]["labels"].to(dtype=torch.long), out[0]["labels"])
-        self.assertTrue("boxes" in out[0])
-        self.assertTrue("scores" in out[0])
-        self.assertTrue("labels" in out[0])
         # don't check script because we are compiling it here:
         # TODO: refactor tests
         # self.check_script(model, name)
@@ -213,7 +227,15 @@ class ModelTester(TestCase):
                 out = model(model_input)
                 # See autocast_flaky_numerics comment at top of file.
                 if name not in autocast_flaky_numerics:
-                    check_out(out)
+                    full_validation &= check_out(out)
+
+        if not full_validation:
+            msg = "The output of {} could only be partially validated. " \
+                  "This is likely due to unit-test flakiness, but you may " \
+                  "want to do additional manual checks if you made " \
+                  "significant changes to the codebase.".format(self._testMethodName)
+            warnings.warn(msg, RuntimeWarning)
+            raise unittest.SkipTest(msg)
 
     def _test_detection_model_validation(self, name):
         set_rng_seed(0)
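The partial-validation handling added at the end of the test follows a general warn-then-skip pattern: emit a warning so the reduced coverage is visible in the logs, then raise unittest.SkipTest so the test is reported as skipped rather than as a clean pass. A minimal, hypothetical example of that pattern (not taken from the commit):

    import unittest
    import warnings

    class PartialValidationExample(unittest.TestCase):
        def test_something_flaky(self):
            full_validation = self._run_checks()
            if not full_validation:
                msg = "The output of {} could only be partially validated.".format(self._testMethodName)
                warnings.warn(msg, RuntimeWarning)
                raise unittest.SkipTest(msg)  # reported as skipped, not as a silent pass

        def _run_checks(self):
            # Placeholder: pretend the strict comparison failed and only a weaker check passed.
            return False

    if __name__ == "__main__":
        unittest.main()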