"vscode:/vscode.git/clone" did not exist on "c90cb0bc1420853c3b4625800db5fdaff871db1f"
Commit 7a3d6c4c authored by Chen Chen, committed by A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 298986381
parent bb8a18c9
...@@ -87,15 +87,15 @@ def create_pretrain_dataset(input_patterns, ...@@ -87,15 +87,15 @@ def create_pretrain_dataset(input_patterns,
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines, dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id) input_pipeline_context.input_pipeline_id)
if is_training:
dataset = dataset.repeat()
dataset = dataset.repeat() # We set shuffle buffer to exactly match total number of
# training files to ensure that training data is well shuffled.
# We set shuffle buffer to exactly match total number of input_files = []
# training files to ensure that training data is well shuffled. for input_pattern in input_patterns:
input_files = [] input_files.extend(tf.io.gfile.glob(input_pattern))
for input_pattern in input_patterns: dataset = dataset.shuffle(len(input_files))
input_files.extend(tf.io.gfile.glob(input_pattern))
dataset = dataset.shuffle(len(input_files))
# In parallel, create tf record dataset for each train files. # In parallel, create tf record dataset for each train files.
# cycle_length = 8 means that up to 8 files will be read and deserialized in # cycle_length = 8 means that up to 8 files will be read and deserialized in
...@@ -132,7 +132,7 @@ def create_pretrain_dataset(input_patterns, ...@@ -132,7 +132,7 @@ def create_pretrain_dataset(input_patterns,
if is_training: if is_training:
dataset = dataset.shuffle(100) dataset = dataset.shuffle(100)
dataset = dataset.batch(batch_size, drop_remainder=True) dataset = dataset.batch(batch_size, drop_remainder=is_training)
dataset = dataset.prefetch(1024) dataset = dataset.prefetch(1024)
return dataset return dataset
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment