Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
fc2056bc
Commit
fc2056bc
authored
Oct 30, 2019
by
Hongkun Yu
Committed by
A. Unique TensorFlower
Oct 30, 2019
Browse files
Clean up primary_cpu_task. It is not necessary.
PiperOrigin-RevId: 277649735
parent
53eff257
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
45 additions
and
63 deletions
+45
-63
official/modeling/training/distributed_executor.py
official/modeling/training/distributed_executor.py
+1
-16
official/nlp/bert/run_squad.py
official/nlp/bert/run_squad.py
+44
-47
No files found.
official/modeling/training/distributed_executor.py
View file @
fc2056bc
...
...
@@ -131,13 +131,6 @@ def hparam_flags_dict():
}
def primary_cpu_task(use_remote_tpu=False):
  """Returns the primary CPU task to which input pipeline ops are put.

  Args:
    use_remote_tpu: Whether training runs in remote-TPU mode.

  Returns:
    The device/job string for the TPU worker when `use_remote_tpu` is True,
    otherwise the empty string (local placement).
  """
  # Remote Eager Borg job configures the TPU worker with job name 'worker'.
  if use_remote_tpu:
    return '/job:worker'
  return ''
def
_save_checkpoint
(
checkpoint
,
model_dir
,
checkpoint_prefix
):
"""Saves model to model_dir with provided checkpoint prefix."""
...
...
@@ -204,7 +197,6 @@ class DistributedExecutor(object):
metric_fn: metric function. Signature: () -> tf.keras.metrics.Metric.
is_multi_host: Set to True when using multi hosts for training, like multi
worker GPU or TPU pod (slice). Otherwise, False.
use_remote_tpu: If True, run on remote TPU mode.
"""
def
__init__
(
self
,
...
...
@@ -212,14 +204,12 @@ class DistributedExecutor(object):
params
,
model_fn
,
loss_fn
,
is_multi_host
=
False
,
use_remote_tpu
=
False
):
is_multi_host
=
False
):
self
.
_params
=
params
self
.
_model_fn
=
model_fn
self
.
_loss_fn
=
loss_fn
self
.
_strategy
=
strategy
self
.
_use_remote_tpu
=
use_remote_tpu
self
.
_checkpoint_name
=
'ctl_step_{step}.ckpt'
self
.
_is_multi_host
=
is_multi_host
...
...
@@ -867,11 +857,6 @@ class ExecutorBuilder(object):
raise
ValueError
(
'`strategy` should not be None. You need to specify '
'`strategy_type` in the builder contructor or directly '
'set the `strategy` property of the builder.'
)
if
'use_remote_tpu'
not
in
kwargs
:
use_remote_tpu
=
(
isinstance
(
self
.
_strategy
,
tf
.
distribute
.
experimental
.
TPUStrategy
)
and
bool
(
self
.
_strategy_config
.
tpu
))
kwargs
[
'use_remote_tpu'
]
=
use_remote_tpu
return
class_ctor
(
strategy
=
self
.
_strategy
,
params
=
params
,
...
...
official/nlp/bert/run_squad.py
View file @
fc2056bc
...
...
@@ -135,53 +135,50 @@ def get_raw_results(predictions):
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
  """Make predictions using a Bert-based squad model."""
  # Pin input-pipeline ops to the remote TPU worker's CPU when running on a
  # Cloud/Borg TPU; '' leaves placement to the default device.
  primary_cpu_task = '/job:worker' if FLAGS.tpu else ''

  with tf.device(primary_cpu_task):
    # Evaluation-mode dataset: no shuffling/repeats (is_training=False).
    predict_dataset = input_pipeline.create_squad_dataset(
        predict_tfrecord_path,
        input_meta_data['max_seq_length'],
        FLAGS.predict_batch_size,
        is_training=False)
    predict_iterator = iter(
        strategy.experimental_distribute_dataset(predict_dataset))

    # NOTE(review): the strategy.scope() nesting below is reconstructed from a
    # flattened diff — confirm against the original file that only model
    # construction (not checkpoint restore) sits inside the scope.
    with strategy.scope():
      # Prediction always uses float32, even if training uses mixed precision.
      tf.keras.mixed_precision.experimental.set_policy('float32')
      squad_model, _ = bert_models.squad_model(
          bert_config,
          input_meta_data['max_seq_length'],
          float_type=tf.float32)

    # Restore the latest training checkpoint; expect_partial() suppresses
    # warnings about training-only objects (e.g. optimizer slots) not used at
    # prediction time.
    checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    logging.info('Restoring checkpoints from %s', checkpoint_path)
    checkpoint = tf.train.Checkpoint(model=squad_model)
    checkpoint.restore(checkpoint_path).expect_partial()

    @tf.function
    def predict_step(iterator):
      """Predicts on distributed devices."""

      def _replicated_step(inputs):
        """Replicated prediction calculation."""
        # Dataset yields (features, labels); labels are unused at predict time.
        x, _ = inputs
        unique_ids, start_logits, end_logits = squad_model(x, training=False)
        return dict(
            unique_ids=unique_ids,
            start_logits=start_logits,
            end_logits=end_logits)

      # Run one replicated step and gather the per-replica outputs.
      outputs = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
      return tf.nest.map_structure(strategy.experimental_local_results,
                                   outputs)

    all_results = []
    for _ in range(num_steps):
      predictions = predict_step(predict_iterator)
      for result in get_raw_results(predictions):
        all_results.append(result)
      # Periodic progress log; fires only when the running total is a
      # multiple of 100.
      if len(all_results) % 100 == 0:
        logging.info('Made predictions for %d records.', len(all_results))

    return all_results
  # Post-commit body of predict_squad_customized: identical logic to the old
  # version minus the tf.device(primary_cpu_task) wrapper, which this commit
  # removes as unnecessary.
  # Evaluation-mode dataset: no shuffling/repeats (is_training=False).
  predict_dataset = input_pipeline.create_squad_dataset(
      predict_tfrecord_path,
      input_meta_data['max_seq_length'],
      FLAGS.predict_batch_size,
      is_training=False)
  predict_iterator = iter(
      strategy.experimental_distribute_dataset(predict_dataset))

  # NOTE(review): scope nesting reconstructed from a flattened diff — confirm
  # only model construction sits inside strategy.scope() in the real file.
  with strategy.scope():
    # Prediction always uses float32, even if training uses mixed precision.
    tf.keras.mixed_precision.experimental.set_policy('float32')
    squad_model, _ = bert_models.squad_model(
        bert_config,
        input_meta_data['max_seq_length'],
        float_type=tf.float32)

  # Restore the latest training checkpoint; expect_partial() suppresses
  # warnings about training-only objects not used at prediction time.
  checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
  logging.info('Restoring checkpoints from %s', checkpoint_path)
  checkpoint = tf.train.Checkpoint(model=squad_model)
  checkpoint.restore(checkpoint_path).expect_partial()

  @tf.function
  def predict_step(iterator):
    """Predicts on distributed devices."""

    def _replicated_step(inputs):
      """Replicated prediction calculation."""
      # Dataset yields (features, labels); labels are unused at predict time.
      x, _ = inputs
      unique_ids, start_logits, end_logits = squad_model(x, training=False)
      return dict(
          unique_ids=unique_ids,
          start_logits=start_logits,
          end_logits=end_logits)

    # Run one replicated step and gather the per-replica outputs.
    outputs = strategy.experimental_run_v2(
        _replicated_step, args=(next(iterator),))
    return tf.nest.map_structure(strategy.experimental_local_results,
                                 outputs)

  all_results = []
  for _ in range(num_steps):
    predictions = predict_step(predict_iterator)
    for result in get_raw_results(predictions):
      all_results.append(result)
    # Periodic progress log; fires only when the running total is a
    # multiple of 100.
    if len(all_results) % 100 == 0:
      logging.info('Made predictions for %d records.', len(all_results))

  return all_results
def
train_squad
(
strategy
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment