Commit 9557e02a authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 363529035
parent e4388f88
@@ -44,8 +44,10 @@ class DataConfig(base_config.Config):
     drop_remainder: Whether the last batch should be dropped in the case it has
       fewer than `global_batch_size` elements.
     shuffle_buffer_size: The buffer size used for shuffling training data.
-    cache: Whether to cache dataset examples. Can be used to avoid re-reading
-      from disk on the second epoch. Requires significant memory overhead.
+    cache: Whether to cache dataset examples. If `True`, we will cache the
+      dataset after applying the decode_fn and parse_fn. It can be used to
+      avoid re-reading from disk, re-decoding and re-parsing the examples on
+      the second epoch, but it requires significant memory overhead.
     cycle_length: The number of files that will be processed concurrently when
       interleaving files.
     block_length: The number of consecutive elements to produce from each input
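For context, the behavior described by the updated `cache` docstring corresponds to the following tf.data pattern. This is a minimal sketch, not the module's code; the `decode_fn` and `parse_fn` below are trivial stand-ins for the real decode/parse functions.

import tensorflow as tf

# Hypothetical stand-ins for the pipeline's decode_fn and parse_fn.
decode_fn = lambda x: x + 1
parse_fn = lambda x: x * 2

dataset = tf.data.Dataset.range(10)
dataset = dataset.map(decode_fn).map(parse_fn)
# With cache=True, decoded and parsed elements are materialized on the first
# epoch; later epochs skip disk reads, decoding, and parsing, at the cost of
# holding the entire dataset in memory.
dataset = dataset.cache()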
@@ -174,11 +174,13 @@ class InputReader:
     dataset = tf.data.Dataset.from_tensor_slices(matched_files)
     # Shuffle and repeat at file level.
+    # If cache is enabled, `reshuffle_each_iteration` is set to False,
+    # because we will read the same cached data in every iteration anyway.
     if self._is_training:
       dataset = dataset.shuffle(
           len(matched_files),
           seed=self._seed,
-          reshuffle_each_iteration=True)
+          reshuffle_each_iteration=not self._cache)
     # Do not enable sharding if tf.data service is enabled, as sharding will be
     # handled inside tf.data service.
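The `reshuffle_each_iteration` change follows from the new comment: with caching, every iteration replays the same cached elements, so re-shuffling the file list on each iteration buys nothing. A standalone sketch of the same call, with `cache_enabled` standing in for `self._cache` and an arbitrary seed:

import tensorflow as tf

cache_enabled = True  # stand-in for self._cache
matched_files = ['shard-00000.tfrecord', 'shard-00001.tfrecord']
dataset = tf.data.Dataset.from_tensor_slices(matched_files)
# A single fixed file order suffices when the parsed data will be cached;
# without caching, files are reshuffled on every iteration as before.
dataset = dataset.shuffle(
    len(matched_files),
    seed=42,
    reshuffle_each_iteration=not cache_enabled)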
@@ -187,7 +189,9 @@ class InputReader:
         not self._enable_tf_data_service):
       dataset = dataset.shard(input_context.num_input_pipelines,
                               input_context.input_pipeline_id)
-    if self._is_training:
+    # If cache is enabled, we will call `repeat()` later after `cache()`.
+    if self._is_training and not self._cache:
       dataset = dataset.repeat()
     dataset = dataset.interleave(
@@ -222,7 +226,9 @@ class InputReader:
         not self._enable_tf_data_service):
       dataset = dataset.shard(input_context.num_input_pipelines,
                               input_context.input_pipeline_id)
-    if self._is_training:
+    # If cache is enabled, we will call `repeat()` later after `cache()`.
+    if self._is_training and not self._cache:
       dataset = dataset.repeat()
     return dataset
@@ -249,7 +255,8 @@ class InputReader:
         decoders=decoders,
         read_config=read_config)
-    if self._is_training:
+    # If cache is enabled, we will call `repeat()` later after `cache()`.
+    if self._is_training and not self._cache:
       dataset = dataset.repeat()
     return dataset
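All three `repeat()` call sites are gated on `not self._cache` for the same reason: `cache()` only completes after one full pass over a finite dataset, and `repeat()` first would make that first pass never end. A small sketch illustrating the ordering (not the module's code):

import tensorflow as tf

ds = tf.data.Dataset.range(4)

# Correct order: cache the finite dataset once, then repeat the cached copy.
good = ds.cache().repeat()
print(list(good.take(8).as_numpy_iterator()))  # [0, 1, 2, 3, 0, 1, 2, 3]

# Problematic order: repeat() first makes the dataset unbounded, so cache()
# could never finish its initial pass:
# bad = ds.repeat().cache()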
@@ -295,10 +302,8 @@ class InputReader:
       raise ValueError('It is unexpected that `tfds_builder` is None and '
                        'there is also no `matched_files`.')
-    if self._cache:
-      dataset = dataset.cache()
-
-    if self._is_training:
+    # If cache is enabled, we will call `shuffle()` later after `cache()`.
+    if self._is_training and not self._cache:
       dataset = dataset.shuffle(self._shuffle_buffer_size)
     dataset = _maybe_map_fn(dataset, self._decoder_fn)
@@ -306,6 +311,12 @@ class InputReader:
       dataset = dataset.apply(self._sample_fn)
     dataset = _maybe_map_fn(dataset, self._parser_fn)
+    if self._cache:
+      dataset = dataset.cache()
+      if self._is_training:
+        dataset = dataset.repeat()
+        dataset = dataset.shuffle(self._shuffle_buffer_size)
+
     if self._transform_and_batch_fn is not None:
       dataset = self._transform_and_batch_fn(dataset, input_context)
     else:
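Taken together, the change moves the element-level stages into the order decode, parse, cache, repeat, shuffle when caching is on. A self-contained sketch of that ordering; the flag values, buffer size, and map functions are invented for illustration:

import tensorflow as tf

is_training = True        # stand-in for self._is_training
use_cache = True          # stand-in for self._cache
shuffle_buffer_size = 64  # stand-in for self._shuffle_buffer_size

ds = tf.data.Dataset.range(100)
ds = ds.map(lambda x: x + 1)  # decoder_fn stand-in
ds = ds.map(lambda x: x * 2)  # parser_fn stand-in
if use_cache:
  ds = ds.cache()             # cache the decoded/parsed elements
  if is_training:
    ds = ds.repeat()          # repeat only after cache() sees a finite pass
    ds = ds.shuffle(shuffle_buffer_size)  # element-level shuffle per epoch
ds = ds.batch(8, drop_remainder=True)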