"megatron/git@developer.sourcefind.cn:OpenDAS/megatron-lm.git" did not exist on "e515f026361ff36f0ffac8ce01edac206e27505c"
Commit e3704ce2 authored by Yeqing Li, committed by A. Unique TensorFlower
Browse files

Adds trainer and checkpoint exporter as arguments of the run_experiment function.

PiperOrigin-RevId: 368778443
parent 85a6db17
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
"""TFM common training driver library.""" """TFM common training driver library."""
# pytype: disable=attribute-error # pytype: disable=attribute-error
import os import os
from typing import Any, Mapping, Tuple from typing import Any, Mapping, Tuple, Optional
# Import libraries # Import libraries
from absl import logging from absl import logging
...@@ -23,21 +23,23 @@ import orbit ...@@ -23,21 +23,23 @@ import orbit
import tensorflow as tf import tensorflow as tf
from official.core import base_task from official.core import base_task
from official.core import base_trainer
from official.core import config_definitions from official.core import config_definitions
from official.core import train_utils from official.core import train_utils
BestCheckpointExporter = train_utils.BestCheckpointExporter
maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter
def run_experiment(distribution_strategy: tf.distribute.Strategy, def run_experiment(
task: base_task.Task, distribution_strategy: tf.distribute.Strategy,
mode: str, task: base_task.Task,
params: config_definitions.ExperimentConfig, mode: str,
model_dir: str, params: config_definitions.ExperimentConfig,
run_post_eval: bool = False, model_dir: str,
save_summary: bool = True) \ run_post_eval: bool = False,
-> Tuple[tf.keras.Model, Mapping[str, Any]]: save_summary: bool = True,
trainer: Optional[base_trainer.Trainer] = None
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
"""Runs train/eval configured by the experiment params. """Runs train/eval configured by the experiment params.
Args: Args:
...@@ -50,6 +52,8 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy, ...@@ -50,6 +52,8 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy,
run_post_eval: Whether to run post eval once after training, metrics logs run_post_eval: Whether to run post eval once after training, metrics logs
are returned. are returned.
save_summary: Whether to save train and validation summary. save_summary: Whether to save train and validation summary.
trainer: the base_trainer.Trainer instance. It should be created within the
strategy.scope().
Returns: Returns:
A 2-tuple of (model, eval_logs). A 2-tuple of (model, eval_logs).
...@@ -59,13 +63,14 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy, ...@@ -59,13 +63,14 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy,
""" """
with distribution_strategy.scope(): with distribution_strategy.scope():
trainer = train_utils.create_trainer( if not trainer:
params, trainer = train_utils.create_trainer(
task, params,
train='train' in mode, task,
evaluate=('eval' in mode) or run_post_eval, train='train' in mode,
checkpoint_exporter=maybe_create_best_ckpt_exporter( evaluate=('eval' in mode) or run_post_eval,
params, model_dir)) checkpoint_exporter=maybe_create_best_ckpt_exporter(
params, model_dir))
if trainer.checkpoint: if trainer.checkpoint:
checkpoint_manager = tf.train.CheckpointManager( checkpoint_manager = tf.train.CheckpointManager(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment