"vscode:/vscode.git/clone" did not exist on "6a63ad165ad169147130bfe568a749f5252d831b"
Commit 7babedc5 authored by Reed, committed by Taylor Robie
Browse files

Fix spurious "did not start correctly" error. (#5252)

* Fix spurious "did not start correctly" error.

The error "Generation subprocess did not start correctly" would occur if the async process started up after the main process checked for the subproc_alive file.

* Add error message
parent 5856878d
...@@ -452,6 +452,14 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size, ...@@ -452,6 +452,14 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
atexit.register(tf.gfile.DeleteRecursively, atexit.register(tf.gfile.DeleteRecursively,
ncf_dataset.cache_paths.cache_root) ncf_dataset.cache_paths.cache_root)
for _ in range(15):
if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
break
time.sleep(1) # allow `alive` file to be written
if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
raise ValueError("Generation subprocess did not start correctly. Data will "
"not be available; exiting to avoid waiting forever.")
return ncf_dataset return ncf_dataset
...@@ -495,8 +503,10 @@ def make_train_input_fn(ncf_dataset): ...@@ -495,8 +503,10 @@ def make_train_input_fn(ncf_dataset):
"""Construct training input_fn for the current epoch.""" """Construct training input_fn for the current epoch."""
if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive): if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
raise ValueError("Generation subprocess did not start correctly. Data will " # The generation subprocess must have been alive at some point, because we
"not be available; exiting to avoid waiting forever.") # earlier checked that the subproc_alive file existed.
raise ValueError("Generation subprocess unexpectedly died. Data will not "
"be available; exiting to avoid waiting forever.")
train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir
while not tf.gfile.Exists(train_epoch_dir): while not tf.gfile.Exists(train_epoch_dir):
......
...@@ -115,11 +115,6 @@ class BaseTest(tf.test.TestCase): ...@@ -115,11 +115,6 @@ class BaseTest(tf.test.TestCase):
batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2, batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2,
num_neg=NUM_NEG) num_neg=NUM_NEG)
for _ in range(30):
if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
break
time.sleep(1) # allow `alive` file to be written
g = tf.Graph() g = tf.Graph()
with g.as_default(): with g.as_default():
input_fn, record_dir, batch_count = \ input_fn, record_dir, batch_count = \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment