Commit ad5a14f2 authored by Ruoxin Sang, committed by A. Unique TensorFlower

Pass the file patterns instead of flattened file paths into tf.data.Dataset.list_files. This reduces Cloud DF4x4 BERT startup time from 11min34s to 6min53s.

PiperOrigin-RevId: 275953359
parent 2bceb1b7
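
The change defers file-pattern expansion from the trainer to tf.data: create_pretrain_dataset now receives the raw glob patterns and hands them straight to tf.data.Dataset.list_files, globbing only once to size the shuffle buffer. Below is a minimal, self-contained sketch of that pattern-based pipeline; the bucket path, cycle_length, and other values are illustrative assumptions, and only the pattern handling mirrors the diff that follows.

import tensorflow as tf

# Hypothetical TFRecord shard patterns; any glob accepted by tf.io.gfile works.
input_patterns = ['gs://my-bucket/pretrain/shard-*.tfrecord']
is_training = True

# Hand the patterns to list_files directly instead of pre-expanding them,
# so the (potentially slow) remote glob is not on the job-startup path.
dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training)
dataset = dataset.repeat()

# The shuffle buffer is still sized by the total number of matching files,
# so a single glob is kept for that bookkeeping step.
input_files = []
for input_pattern in input_patterns:
  input_files.extend(tf.io.gfile.glob(input_pattern))
dataset = dataset.shuffle(len(input_files))

# Read several shards in parallel, as the surrounding pipeline does.
dataset = dataset.interleave(
    tf.data.TFRecordDataset,
    cycle_length=8,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

Because list_files evaluates the glob lazily inside the tf.data graph, the trainer no longer pays for expanding every shard path before the first step, which is where the startup-time saving mentioned above comes from.
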
@@ -59,7 +59,7 @@ def file_based_input_fn_builder(input_file, name_to_features):
   return input_fn


-def create_pretrain_dataset(file_paths,
+def create_pretrain_dataset(input_patterns,
                             seq_length,
                             max_predictions_per_seq,
                             batch_size,
@@ -83,16 +83,20 @@ def create_pretrain_dataset(file_paths,
           tf.io.FixedLenFeature([1], tf.int64),
   }
-  dataset = tf.data.Dataset.list_files(file_paths, shuffle=is_training)
+  dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training)
   if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
     dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
                             input_pipeline_context.input_pipeline_id)
   dataset = dataset.repeat()
   # We set shuffle buffer to exactly match total number of
   # training files to ensure that training data is well shuffled.
-  dataset = dataset.shuffle(len(file_paths))
+  input_files = []
+  for input_pattern in input_patterns:
+    input_files.extend(tf.io.gfile.glob(input_pattern))
+  dataset = dataset.shuffle(len(input_files))
   # In parallel, create tf record dataset for each train files.
   # cycle_length = 8 means that up to 8 files will be read and deserialized in
@@ -78,12 +78,9 @@ def get_pretrain_input_data(input_file_pattern, seq_length,
   def _dataset_fn(ctx=None):
     """Returns tf.data.Dataset for distributed BERT pretraining."""
-    input_files = []
-    for input_pattern in input_file_pattern.split(','):
-      input_files.extend(tf.io.gfile.glob(input_pattern))
+    input_patterns = input_file_pattern.split(',')
     train_dataset = input_pipeline.create_pretrain_dataset(
-        input_files,
+        input_patterns,
         seq_length,
         max_predictions_per_seq,
         batch_size,