# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""NCF framework to train and evaluate the NeuMF model.

The NeuMF model assembles both MF and MLP models under the NCF framework. Check
`neumf_model.py` for more details about the models.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os

# pylint: disable=g-bad-import-order
from absl import app
from absl import flags
from absl import logging
import tensorflow.compat.v2 as tf
# pylint: enable=g-bad-import-order

from official.common import distribute_utils
from official.recommendation import constants as rconst
from official.recommendation import movielens
from official.recommendation import ncf_common
from official.recommendation import ncf_input_pipeline
from official.recommendation import neumf_model
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers

FLAGS = flags.FLAGS


def metric_fn(logits, dup_mask, match_mlperf):
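  """Computes hit-rate metric inputs from the padded logits.

  Args:
    logits: Model outputs with a leading all-zeros column concatenated on.
    dup_mask: Mask marking duplicate items within an evaluation batch.
    match_mlperf: Whether to apply MLPerf-compliant metric rules.

  Returns:
    A tuple of (in_top_k, metric_weights); the hit rate is
    sum(in_top_k * metric_weights) / sum(metric_weights).
  """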
  dup_mask = tf.cast(dup_mask, tf.float32)
  # Remove the leading all-zeros column that was concatenated onto the logits.
  logits = tf.slice(logits, [0, 1], [-1, -1])
  in_top_k, _, metric_weights, _ = neumf_model.compute_top_k_and_ndcg(
      logits, dup_mask, match_mlperf)
  metric_weights = tf.cast(metric_weights, tf.float32)
  return in_top_k, metric_weights


class MetricLayer(tf.keras.layers.Layer):
  """Custom layer of metrics for NCF model."""

  def __init__(self, match_mlperf):
    super(MetricLayer, self).__init__()
    self.match_mlperf = match_mlperf

  def get_config(self):
    return {"match_mlperf": self.match_mlperf}

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)

  def call(self, inputs, training=False):
    logits, dup_mask = inputs

    if training:
      hr_sum = 0.0
      hr_count = 0.0
    else:
      metric, metric_weights = metric_fn(logits, dup_mask, self.match_mlperf)
      hr_sum = tf.reduce_sum(metric * metric_weights)
      hr_count = tf.reduce_sum(metric_weights)

    self.add_metric(hr_sum, name="hr_sum", aggregation="mean")
    self.add_metric(hr_count, name="hr_count", aggregation="mean")
    return logits


class LossLayer(tf.keras.layers.Layer):
  """Pass-through loss layer for NCF model."""

  def __init__(self, loss_normalization_factor):
    # The loss may overflow in float16, so we use float32 instead.
    super(LossLayer, self).__init__(dtype="float32")
    self.loss_normalization_factor = loss_normalization_factor
    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction="sum")

  def get_config(self):
    return {"loss_normalization_factor": self.loss_normalization_factor}

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)

  def call(self, inputs):
    logits, labels, valid_pt_mask_input = inputs
    loss = self.loss(
        y_true=labels, y_pred=logits, sample_weight=valid_pt_mask_input)
    loss = loss * (1.0 / self.loss_normalization_factor)
    self.add_loss(loss)
    return logits


class IncrementEpochCallback(tf.keras.callbacks.Callback):
  """A callback to increase the requested epoch for the data producer.

  Only a limited amount of data can be buffered, so the buffer is modeled as a
  moving window over the dataset; this callback advances one of the window's
  boundaries at the beginning of each epoch.
  """

  def __init__(self, producer):
    self._producer = producer

  def on_epoch_begin(self, epoch, logs=None):
    self._producer.increment_request_epoch()


class CustomEarlyStopping(tf.keras.callbacks.Callback):
  """Stop training has reached a desired hit rate."""

  def __init__(self, monitor, desired_value):
    super(CustomEarlyStopping, self).__init__()

    self.monitor = monitor
    self.desired = desired_value
    self.stopped_epoch = 0

  def on_epoch_end(self, epoch, logs=None):
    current = self.get_monitor_value(logs)
    if current and current >= self.desired:
      self.stopped_epoch = epoch
      self.model.stop_training = True

  def on_train_end(self, logs=None):
    if self.stopped_epoch > 0:
      print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))

  def get_monitor_value(self, logs):
    logs = logs or {}
    monitor_value = logs.get(self.monitor)
    if monitor_value is None:
      logging.warning(
          "Early stopping conditioned on metric `%s` "
          "which is not available. Available metrics are: %s", self.monitor,
          ",".join(list(logs.keys())))
    return monitor_value


def _get_keras_model(params):
  """Constructs and returns the model."""
  batch_size = params["batch_size"]

  user_input = tf.keras.layers.Input(
      shape=(1,), name=movielens.USER_COLUMN, dtype=tf.int32)

  item_input = tf.keras.layers.Input(
      shape=(1,), name=movielens.ITEM_COLUMN, dtype=tf.int32)

  valid_pt_mask_input = tf.keras.layers.Input(
      shape=(1,), name=rconst.VALID_POINT_MASK, dtype=tf.bool)

  dup_mask_input = tf.keras.layers.Input(
      shape=(1,), name=rconst.DUPLICATE_MASK, dtype=tf.int32)

  label_input = tf.keras.layers.Input(
      shape=(1,), name=rconst.TRAIN_LABEL_KEY, dtype=tf.bool)

  base_model = neumf_model.construct_model(user_input, item_input, params)

  logits = base_model.output
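  # The base model emits a single logit per example. A column of zeros is
  # prepended below so the output can be treated as two-class logits, letting
  # sparse categorical cross-entropy and the top-k metric apply directly.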
  zeros = tf.keras.layers.Lambda(lambda x: x * 0)(logits)
  softmax_logits = tf.keras.layers.concatenate([zeros, logits], axis=-1)

  # When the custom training loop is used, loss and metrics are computed
  # inside the training/evaluation step functions, so the MetricLayer and
  # LossLayer below are only attached for the compile/fit path.
  if not params["keras_use_ctl"]:
    softmax_logits = MetricLayer(
        params["match_mlperf"])([softmax_logits, dup_mask_input])
    # TODO(b/134744680): Use model.add_loss() instead once the API is well
    # supported.
    softmax_logits = LossLayer(batch_size)(
        [softmax_logits, label_input, valid_pt_mask_input])

  keras_model = tf.keras.Model(
      inputs={
          movielens.USER_COLUMN: user_input,
          movielens.ITEM_COLUMN: item_input,
          rconst.VALID_POINT_MASK: valid_pt_mask_input,
          rconst.DUPLICATE_MASK: dup_mask_input,
          rconst.TRAIN_LABEL_KEY: label_input
      },
      outputs=softmax_logits)

  keras_model.summary()
  return keras_model


def run_ncf(_):
  """Run NCF training and eval with Keras."""

  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  model_helpers.apply_clean(FLAGS)

  if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  params = ncf_common.parse_flags(FLAGS)
  params["distribute_strategy"] = strategy
  params["use_tpu"] = (FLAGS.distribution_strategy == "tpu")

  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
    return

  batch_size = params["batch_size"]
  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  (train_input_dataset, eval_input_dataset,
   num_train_steps, num_eval_steps) = \
    (ncf_input_pipeline.create_ncf_input_data(
        params, producer, input_meta_data, strategy))
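  # Offline TFRecord files span all epochs, so Keras needs an explicit step
  # count per epoch; an online-generated dataset is exhausted at each epoch
  # boundary on its own.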
  steps_per_epoch = None if generate_input_online else num_train_steps

  with distribute_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])
    if FLAGS.fp16_implementation == "graph_rewrite":
      optimizer = \
        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer,
            loss_scale=flags_core.get_loss_scale(FLAGS,
                                                 default_for_fp16="dynamic"))
    elif FLAGS.dtype == "fp16":
      loss_scale = flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic")
      # Note Model.compile automatically wraps the optimizer with a
      # LossScaleOptimizer using dynamic loss scaling. We explicitly wrap it
      # here for the case where a custom training loop or fixed loss scale is
      # used.
      if loss_scale == "dynamic":
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
      else:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
            optimizer, dynamic=False, initial_scale=loss_scale)

    if params["keras_use_ctl"]:
      train_loss, eval_results = run_ncf_custom_training(
          params,
          strategy,
          keras_model,
          optimizer,
          callbacks,
          train_input_dataset,
          eval_input_dataset,
          num_train_steps,
          num_eval_steps,
          generate_input_online=generate_input_online)
    else:
      keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly)

      if not FLAGS.ml_perf:
        # Create Tensorboard summary and checkpoint callbacks.
        summary_dir = os.path.join(FLAGS.model_dir, "summaries")
        summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
        checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True)

        callbacks += [summary_callback, checkpoint_callback]

      history = keras_model.fit(
          train_input_dataset,
          epochs=FLAGS.train_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=callbacks,
          validation_data=eval_input_dataset,
          validation_steps=num_eval_steps,
          verbose=2)

      logging.info("Training done. Start evaluating")

      eval_loss_and_metrics = keras_model.evaluate(
          eval_input_dataset, steps=num_eval_steps, verbose=2)

      logging.info("Keras evaluation is done.")

      # Keras evaluate() API returns scalar loss and metric values from
      # evaluation as a list. Here, the returned list would contain
      # [evaluation loss, hr sum, hr count].
      eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

      # Format evaluation result into [eval loss, eval hit accuracy].
      eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

      # `history` can be empty if no training steps ran; initialize the loss
      # so build_stats() below never sees an undefined value.
      train_loss = None
      if history and history.history:
        train_history = history.history
        train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats


def run_ncf_custom_training(params,
                            strategy,
                            keras_model,
                            optimizer,
                            callbacks,
                            train_input_dataset,
                            eval_input_dataset,
                            num_train_steps,
                            num_eval_steps,
                            generate_input_online=True):
  """Runs custom training loop.

  Args:
    params: Dictionary containing training parameters.
    strategy: Distribution strategy to be used for distributed training.
    keras_model: Model used for training.
    optimizer: Optimizer used for training.
    callbacks: Callbacks to be invoked between batches/epochs.
    train_input_dataset: tf.data.Dataset used for training.
    eval_input_dataset: tf.data.Dataset used for evaluation.
    num_train_steps: Total number of steps to run for training.
    num_eval_steps: Total number of steps to run for evaluation.
    generate_input_online: Whether input data was generated by the data
      producer. If so, the train dataset must be re-initialized after every
      epoch.

  Returns:
    A tuple of train loss and a list of training and evaluation results.
  """
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
      reduction="sum", from_logits=True)
  train_input_iterator = iter(
      strategy.experimental_distribute_dataset(train_input_dataset))
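  # This iterator is reused across epochs when reading offline TFRecords; for
  # online-generated input it is re-created at each epoch in the loop below.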

  def train_step(train_iterator):
    """Called once per step to train the model."""

    def step_fn(features):
      """Computes loss and applied gradient per replica."""
      with tf.GradientTape() as tape:
        softmax_logits = keras_model(features)
        # The loss can overflow in float16, so we cast to float32.
        softmax_logits = tf.cast(softmax_logits, "float32")
        labels = features[rconst.TRAIN_LABEL_KEY]
        loss = loss_object(
            labels,
            softmax_logits,
            sample_weight=features[rconst.VALID_POINT_MASK])
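        # Normalize by the global batch size: each replica computes a sum of
        # per-example losses, so the later cross-replica SUM reduction yields
        # the mean loss over the global batch.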
        loss *= (1.0 / params["batch_size"])
        if FLAGS.dtype == "fp16":
          loss = optimizer.get_scaled_loss(loss)

      grads = tape.gradient(loss, keras_model.trainable_variables)
      if FLAGS.dtype == "fp16":
        grads = optimizer.get_unscaled_gradients(grads)
      # Converting gradients to dense form improves GPU performance for NCF.
      grads = neumf_model.sparse_to_dense_grads(
          list(zip(grads, keras_model.trainable_variables)))
      optimizer.apply_gradients(grads)
      return loss

    per_replica_losses = strategy.run(step_fn, args=(next(train_iterator),))
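    # Each per-replica loss is already divided by the global batch size, so a
    # SUM reduction across replicas gives the mean loss over the global batch.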
    mean_loss = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
    return mean_loss

  def eval_step(eval_iterator):
    """Called once per eval step to compute eval metrics."""

    def step_fn(features):
      """Computes eval metrics per replica."""
      softmax_logits = keras_model(features)
      in_top_k, metric_weights = metric_fn(softmax_logits,
                                           features[rconst.DUPLICATE_MASK],
                                           params["match_mlperf"])
      hr_sum = tf.reduce_sum(in_top_k * metric_weights)
      hr_count = tf.reduce_sum(metric_weights)
      return hr_sum, hr_count

    per_replica_hr_sum, per_replica_hr_count = (
        strategy.run(step_fn, args=(next(eval_iterator),)))
    hr_sum = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
    hr_count = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
    return hr_sum, hr_count

  # Compile the step functions with tf.function unless eager execution was
  # explicitly requested.
  if not FLAGS.run_eagerly:
    train_step = tf.function(train_step)
    eval_step = tf.function(eval_step)

  for callback in callbacks:
    callback.on_train_begin()

  # Do not write TensorBoard summaries when running in MLPerf mode.
  if FLAGS.ml_perf:
    eval_summary_writer, train_summary_writer = None, None
  else:
    summary_dir = os.path.join(FLAGS.model_dir, "summaries")
    eval_summary_writer = tf.summary.create_file_writer(
        os.path.join(summary_dir, "eval"))
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(summary_dir, "train"))

  train_loss = 0
  for epoch in range(FLAGS.train_epochs):
    for cb in callbacks:
      cb.on_epoch_begin(epoch)

    # Because the NCF dataset is sampled with randomness, repeating data
    # elements across epochs noticeably hurts convergence. Offline-generated
    # TFRecord files therefore contain all epochs' worth of data, so the
    # dataset only needs to be re-initialized per epoch when input is
    # generated online.
    if generate_input_online:
      train_input_iterator = iter(
          strategy.experimental_distribute_dataset(train_input_dataset))

    train_loss = 0
    for step in range(num_train_steps):
      current_step = step + epoch * num_train_steps
      for c in callbacks:
        c.on_batch_begin(current_step)

      train_loss += train_step(train_input_iterator)

      # Write the train loss once every 1000 steps.
      if train_summary_writer and step % 1000 == 0:
        with train_summary_writer.as_default():
          tf.summary.scalar(
              "training_loss", train_loss / (step + 1), step=current_step)

      for c in callbacks:
        c.on_batch_end(current_step)

    train_loss /= num_train_steps
    logging.info("Done training epoch %s, epoch loss=%.3f", epoch + 1,
                 train_loss)

    eval_input_iterator = iter(
        strategy.experimental_distribute_dataset(eval_input_dataset))

    hr_sum = 0.0
    hr_count = 0.0
    for _ in range(num_eval_steps):
      step_hr_sum, step_hr_count = eval_step(eval_input_iterator)
      hr_sum += step_hr_sum
      hr_count += step_hr_count

    logging.info("Done eval epoch %s, hit_rate=%.3f", epoch + 1,
                 hr_sum / hr_count)
    if eval_summary_writer:
      with eval_summary_writer.as_default():
        tf.summary.scalar("hit_rate", hr_sum / hr_count, step=current_step)

    if (FLAGS.early_stopping and
        float(hr_sum / hr_count) > params["hr_threshold"]):
      break

  for c in callbacks:
    c.on_train_end()

  # Saving the model at the end of training.
  if not FLAGS.ml_perf:
    checkpoint = tf.train.Checkpoint(model=keras_model, optimizer=optimizer)
    checkpoint_path = os.path.join(FLAGS.model_dir, "ctl_checkpoint")
    checkpoint.save(checkpoint_path)
    logging.info("Saving model as TF checkpoint: %s", checkpoint_path)

  return train_loss, [None, hr_sum / hr_count]


def build_stats(loss, eval_result, time_callback):
  """Normalizes and returns dictionary of stats.

  Args:
    loss: The final loss at training time.
    eval_result: Output of the eval step. Assumes first value is eval_loss and
      second value is accuracy_top_1.
    time_callback: Time tracking callback likely used during keras.fit.

  Returns:
    Dictionary of normalized results.
  """
  stats = {}
  if loss:
    stats["loss"] = loss

  if eval_result:
    stats["eval_loss"] = eval_result[0]
    stats["eval_hit_rate"] = eval_result[1]

  if time_callback:
    timestamp_log = time_callback.timestamp_log
    stats["step_timestamp_log"] = timestamp_log
    stats["train_finish_time"] = time_callback.train_finish_time
    if len(timestamp_log) > 1:
      stats["avg_exp_per_second"] = (
          time_callback.batch_size * time_callback.log_steps *
          (len(time_callback.timestamp_log) - 1) /
          (timestamp_log[-1].timestamp - timestamp_log[0].timestamp))

  return stats


def main(_):
  logging.info("Result is %s", run_ncf(FLAGS))


if __name__ == "__main__":
  ncf_common.define_ncf_flags()
  app.run(main)