Commit cbdd46ab authored by Owen Wang, committed by Facebook GitHub Bot

make prediction count evaluation stable with DDP

Summary: The prediction count evaluator needs to gather its state across processes before computing metrics; otherwise, when parallelized across N GPUs, we only get metrics computed from 1/N of the dataset, increasing our eval signal's variance.

Reviewed By: wat3rBro

Differential Revision: D27416864

fbshipit-source-id: b2c5334cd5a38bebcd06c6ace1627a6b71645fdd
parent 82f17be0
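To make the summary's point concrete, here is a toy illustration (an editor's sketch, reusing the per-image prediction counts from the unit test added in this commit) of how a single rank's mean differs from the mean over the full dataset when the data is sharded across two GPUs:

import numpy as np

# Per-image prediction counts, split across two hypothetical DDP ranks.
rank0_counts = [3, 3, 2]  # shard seen by GPU 0
rank1_counts = [2, 1]     # shard seen by GPU 1

print(np.mean(rank0_counts))                 # ~2.67: what rank 0 alone would report
print(np.mean(rank1_counts))                 # 1.5:   what rank 1 alone would report
print(np.mean(rank0_counts + rank1_counts))  # 2.2:   the metric over the full dataset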
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
 import logging
 from collections import OrderedDict

 import numpy as np

+import detectron2.utils.comm as comm
 from detectron2.evaluation import DatasetEvaluator

 logger = logging.getLogger(__name__)

@@ -22,6 +22,11 @@ class PredictionCountEvaluator(DatasetEvaluator):
     :func:`inference_on_dataset` to see how this class will be called.
     """

+    def __init__(self, distributed: bool = True):
+        self._distributed = distributed
+        self.prediction_counts = []
+        self.confidence_scores = []
+
     def reset(self):
         self.prediction_counts = []
         self.confidence_scores = []

@@ -54,8 +59,21 @@ class PredictionCountEvaluator(DatasetEvaluator):
                 * key: the name of the task (e.g., bbox)
                 * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
         """
-        mpi = np.mean(self.prediction_counts)
-        mcp = np.mean(self.confidence_scores)
+        if self._distributed:
+            comm.synchronize()
+            prediction_counts = comm.gather(self.prediction_counts, dst=0)
+            prediction_counts = list(itertools.chain(*prediction_counts))
+            confidence_scores = comm.gather(self.confidence_scores, dst=0)
+            confidence_scores = list(itertools.chain(*confidence_scores))
+
+            if not comm.is_main_process():
+                return {}
+        else:
+            prediction_counts = self.prediction_counts
+            confidence_scores = self.confidence_scores
+
+        mpi = np.mean(prediction_counts)
+        mcp = np.mean(confidence_scores)
         output_metrics = OrderedDict(
             {
                 "false_positives": {
...
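For context, a minimal sketch of how this evaluator is typically driven under multi-GPU evaluation; the wrapper name evaluate_prediction_counts is illustrative, inference_on_dataset is detectron2's standard evaluation loop, and model/data_loader are assumed to be supplied by the caller:

from detectron2.evaluation import inference_on_dataset

from d2go.evaluation.prediction_count_evaluation import PredictionCountEvaluator


def evaluate_prediction_counts(model, data_loader):
    # Each DDP worker runs inference on its own shard of data_loader.
    # With distributed=True, evaluate() gathers every worker's state onto
    # the main process, so the reported means cover the full dataset
    # rather than 1/N of it; non-main ranks get an empty dict back.
    evaluator = PredictionCountEvaluator(distributed=True)
    return inference_on_dataset(model, data_loader, evaluator)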
#!/usr/bin/env python3

import unittest

import torch
from d2go.evaluation.prediction_count_evaluation import PredictionCountEvaluator
from detectron2.structures.instances import Instances


class TestPredictionCountEvaluation(unittest.TestCase):
    def setUp(self):
        self.evaluator = PredictionCountEvaluator()
        image_size = (224, 224)
        self.mock_outputs = [
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8, 0.7]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8, 0.7]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9, 0.8]))},
            {"instances": Instances(image_size, scores=torch.Tensor([0.9]))},
        ]
        # PredictionCountEvaluator does not depend on inputs
        self.mock_inputs = [None] * len(self.mock_outputs)

    def test_process_evaluate_reset(self):
        self.assertEqual(len(self.evaluator.prediction_counts), 0)
        self.assertEqual(len(self.evaluator.confidence_scores), 0)

        # Test that `process` registers the outputs.
        self.evaluator.process(self.mock_inputs, self.mock_outputs)
        self.assertListEqual(self.evaluator.prediction_counts, [3, 3, 2, 2, 1])
        self.assertEqual(len(self.evaluator.confidence_scores), 11)

        # Test that `evaluate` returns the correct metrics.
        output_metrics = self.evaluator.evaluate()
        self.assertDictAlmostEqual(
            output_metrics,
            {
                "false_positives": {
                    "predictions_per_image": 11 / 5,
                    "confidence_per_prediction": (0.9 * 5 + 0.8 * 4 + 0.7 * 2) / 11,
                }
            },
        )

        # Test that `reset` clears the evaluator state.
        self.evaluator.reset()
        self.assertEqual(len(self.evaluator.prediction_counts), 0)
        self.assertEqual(len(self.evaluator.confidence_scores), 0)

    def assertDictAlmostEqual(self, dict1, dict2):
        keys1 = list(dict1.keys())
        keys2 = list(dict2.keys())
        # Assert lists are equal, irrespective of ordering
        self.assertCountEqual(keys1, keys2)

        for k, v1 in dict1.items():
            v2 = dict2[k]
            if isinstance(v2, list):
                self.assertListEqual(v1, v2)
            elif isinstance(v2, dict):
                self.assertDictAlmostEqual(v1, v2)
            else:
                self.assertAlmostEqual(v1, v2)