Disable checkpointing only when a multi-worker strategy is used. (#6322)

a5db4420 · Ayush Dubey · Toby Boyd · 8cf8446b · a5db4420
Commit a5db4420 authored Mar 07, 2019 by Ayush Dubey Committed by Toby Boyd Mar 07, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 1 deletion

official/resnet/resnet_run_loop.py official/resnet/resnet_run_loop.py +5 -1

No files found.
--- a/official/resnet/resnet_run_loop.py
+++ b/official/resnet/resnet_run_loop.py
@@ -486,10 +486,14 @@ def resnet_main(

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
+  # TODO(ayushd,yuefengz): re-enable checkpointing for multi-worker strategy.
+  save_checkpoints_secs = (None if distribution_strategy.__class__.__name__ in
+                           ['CollectiveAllReduceStrategy',
+                            'MultiWorkerMirroredStrategy'] else 60*60*24)
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
-      save_checkpoints_secs=None,
+      save_checkpoints_secs=save_checkpoints_secs,
      save_checkpoints_steps=None)

  # Initializes model with all but the dense layer from pretrained ResNet.