Commit 34beb7ad authored by Reed, committed by Taylor Robie
Browse files

Fix race condition with ready file. (#5271)

parent e6ce8cdd
......@@ -58,6 +58,7 @@ NUM_EVAL_NEGATIVES = 999
CYCLES_TO_BUFFER = 3 # The number of train cycles worth of data to "run ahead"
# of the main training loop.
READY_FILE_TEMP = "ready.json.temp"
READY_FILE = "ready.json"
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
......
......@@ -282,11 +282,17 @@ def _construct_training_records(
raise ValueError("Error detected: point counts do not match: {} vs. {}"
.format(num_pts, written_pts))
with tf.gfile.Open(os.path.join(record_dir, rconst.READY_FILE), "w") as f:
# We write to a temp file then atomically rename it to the final file, because
# writing directly to the final file can cause the main process to read a
# partially written JSON file.
ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
with tf.gfile.Open(ready_file_temp, "w") as f:
json.dump({
"batch_size": train_batch_size,
"batch_count": batch_count,
}, f)
ready_file = os.path.join(record_dir, rconst.READY_FILE)
tf.gfile.Rename(ready_file_temp, ready_file)
log_msg("Cycle {} complete. Total time: {:.1f} seconds"
.format(train_cycle, timeit.default_timer() - st))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment