Improve testing and documentation for weighted dataset sampling.

PiperOrigin-RevId: 345586199

Improve testing and documentation for weighted dataset sampling.
PiperOrigin-RevId: 345586199
b4499b90 · Yoni Ben-Meshulam · TF Object Detection Team · 101282ad · b4499b90 · b4499b90
Commit b4499b90 authored Dec 03, 2020 by Yoni Ben-Meshulam Committed by TF Object Detection Team Dec 03, 2020
2 changed files
--- a/research/object_detection/builders/dataset_builder_test.py
+++ b/research/object_detection/builders/dataset_builder_test.py
@@ -577,6 +577,28 @@ class ReadDatasetTest(test_case.TestCase):
    self._assert_item_count(data, 2, 0.25)
    self._assert_item_count(data, 20, 0.25)
+  def test_read_dataset_sample_from_datasets_weights_non_normalized(self):
+    """Ensure equal weights not summing to 1 still sample inputs equally."""
+    config = input_reader_pb2.InputReader()
+    config.num_readers = 2
+    config.shuffle = False
+    # Weights are deliberately not normalized to sum to 1: both datasets get
+    # weight 1, so the expected split between them is still 50/50.
+    config.sample_from_datasets_weights.extend([1, 1])
+    def graph_fn():
+      return self._get_dataset_next(
+          [self._path_template % '0', self._path_template % '1'],
+          config,
+          batch_size=1000)
+    data = list(self.execute(graph_fn, []))
+    self.assertEqual(len(data), 1000)
+    self._assert_item_count(data, 1, 0.25)
+    self._assert_item_count(data, 10, 0.25)
+    self._assert_item_count(data, 2, 0.25)
+    self._assert_item_count(data, 20, 0.25)
  def test_read_dataset_sample_from_datasets_weights_zero_weight(self):
    """Ensure that the files' values are equally-weighted."""
    config = input_reader_pb2.InputReader()

--- a/research/object_detection/protos/input_reader.proto
+++ b/research/object_detection/protos/input_reader.proto
@@ -159,7 +159,22 @@ message InputReader {
  // applied individually to each dataset.
  //
  // Implementation follows tf.data.experimental.sample_from_datasets sampling
-  // strategy.
+  // strategy. Weights need not sum to 1 - only relative weights matter.
+  // A zero weight results in the corresponding dataset not being sampled.
+  //
+  // Examples, assuming two input files are configured:
+  //
+  // Equal weighting:
+  // sample_from_datasets_weights: 0.5
+  // sample_from_datasets_weights: 0.5
+  //
+  // 2:1 weighting:
+  // sample_from_datasets_weights: 2
+  // sample_from_datasets_weights: 1
+  //
+  // Exclude the second dataset:
+  // sample_from_datasets_weights: 1
+  // sample_from_datasets_weights: 0
  repeated float sample_from_datasets_weights = 34;