Commit cbdd46ab authored by Owen Wang, committed by Facebook GitHub Bot

make prediction count evaluation stable with DDP

Summary: The prediction count evaluator needs to gather its state across processes before computing metrics; otherwise, when parallelized across N GPUs, we only get metrics computed from 1/N of the dataset, increasing our eval signal's variance.

Reviewed By: wat3rBro

Differential Revision: D27416864

fbshipit-source-id: b2c5334cd5a38bebcd06c6ace1627a6b71645fdd
parent 82f17be0
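To make the summary's point concrete, here is a toy illustration (an editor's sketch, reusing the per-image prediction counts from the unit test added in this commit) of how a single rank's mean differs from the mean over the full dataset when the data is sharded across two GPUs:

import numpy as np

# Per-image prediction counts, split across two hypothetical DDP ranks.
rank0_counts = [3, 3, 2]  # shard seen by GPU 0
rank1_counts = [2, 1]     # shard seen by GPU 1

print(np.mean(rank0_counts))                 # ~2.67: what rank 0 alone would report
print(np.mean(rank1_counts))                 # 1.5:   what rank 1 alone would report
print(np.mean(rank0_counts + rank1_counts))  # 2.2:   the metric over the full dataset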
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
 import logging
 from collections import OrderedDict

 import numpy as np

+import detectron2.utils.comm as comm
 from detectron2.evaluation import DatasetEvaluator

 logger = logging.getLogger(__name__)

@@ -22,6 +22,11 @@ class PredictionCountEvaluator(DatasetEvaluator):
     :func:`inference_on_dataset` to see how this class will be called.
     """

+    def __init__(self, distributed: bool = True):
+        self._distributed = distributed
+        self.prediction_counts = []
+        self.confidence_scores = []
+
     def reset(self):
         self.prediction_counts = []
         self.confidence_scores = []

@@ -54,8 +59,21 @@ class PredictionCountEvaluator(DatasetEvaluator):
                 * key: the name of the task (e.g., bbox)
                 * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
         """
-        mpi = np.mean(self.prediction_counts)
-        mcp = np.mean(self.confidence_scores)
+        if self._distributed:
+            comm.synchronize()
+            prediction_counts = comm.gather(self.prediction_counts, dst=0)
+            prediction_counts = list(itertools.chain(*prediction_counts))
+            confidence_scores = comm.gather(self.confidence_scores, dst=0)
+            confidence_scores = list(itertools.chain(*confidence_scores))
+
+            if not comm.is_main_process():
+                return {}
+        else:
+            prediction_counts = self.prediction_counts
+            confidence_scores = self.confidence_scores
+
+        mpi = np.mean(prediction_counts)
+        mcp = np.mean(confidence_scores)
         output_metrics = OrderedDict(
             {
                 "false_positives": {
...
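For context, a minimal sketch of how this evaluator is typically driven under multi-GPU evaluation; the wrapper name evaluate_prediction_counts is illustrative, inference_on_dataset is detectron2's standard evaluation loop, and model/data_loader are assumed to be supplied by the caller:

from detectron2.evaluation import inference_on_dataset

from d2go.evaluation.prediction_count_evaluation import PredictionCountEvaluator


def evaluate_prediction_counts(model, data_loader):
    # Each DDP worker runs inference on its own shard of data_loader.
    # With distributed=True, evaluate() gathers every worker's state onto
    # the main process, so the reported means cover the full dataset
    # rather than 1/N of it; non-main ranks get an empty dict back.
    evaluator = PredictionCountEvaluator(distributed=True)
    return inference_on_dataset(model, data_loader, evaluator)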
#!/usr/bin/env python3

import unittest

import torch
from d2go.evaluation.prediction_count_evaluation import PredictionCountEvaluator
from detectron2.structures.instances import Instances


class TestPredictionCountEvaluation(unittest.TestCase):
    def setUp(self):
        self.evaluator = PredictionCountEvaluator()
        image_size = (224, 224)
        self.mock_outputs = [
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8, 0.7]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8, 0.7]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9]))},
        ]
        # PredictionCountEvaluator does not depend on inputs
        self.mock_inputs = [None] * len(self.mock_outputs)

    def test_process_evaluate_reset(self):
        self.assertEqual(len(self.evaluator.prediction_counts), 0)
        self.assertEqual(len(self.evaluator.confidence_scores), 0)

        # Test that `process` registers the outputs.
        self.evaluator.process(self.mock_inputs, self.mock_outputs)
        self.assertListEqual(self.evaluator.prediction_counts, [3, 3, 2, 2, 1])
        self.assertEqual(len(self.evaluator.confidence_scores), 11)

        # Test that `evaluate` returns the correct metrics.
        output_metrics = self.evaluator.evaluate()
        self.assertDictAlmostEqual(
            output_metrics,
            {
                "false_positives": {
                    "predictions_per_image": 11 / 5,
                    "confidence_per_prediction": (0.9 * 5 + 0.8 * 4 + 0.7 * 2) / 11,
                }
            },
        )

        # Test that `reset` clears the evaluator state.
        self.evaluator.reset()
        self.assertEqual(len(self.evaluator.prediction_counts), 0)
        self.assertEqual(len(self.evaluator.confidence_scores), 0)

    def assertDictAlmostEqual(self, dict1, dict2):
        keys1 = list(dict1.keys())
        keys2 = list(dict2.keys())
        # Assert lists are equal, irrespective of ordering
        self.assertCountEqual(keys1, keys2)

        for k, v1 in dict1.items():
            v2 = dict2[k]
            if isinstance(v2, list):
                self.assertListEqual(v1, v2)
            elif isinstance(v2, dict):
                self.assertDictAlmostEqual(v1, v2)
            else:
                self.assertAlmostEqual(v1, v2)