# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common functionalities used by both Keras and Estimator implementations."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import logging as python_logging
import os

# pylint: disable=g-bad-import-order

import numpy as np
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: enable=g-bad-import-order

from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import data_preprocessing
from official.recommendation import movielens
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils

FLAGS = flags.FLAGS


def get_inputs(params):
  """Returns dataset metadata, per-epoch step counts, and the data producer."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = movielens.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
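
# Example usage (illustrative sketch, not upstream code): a caller typically
# parses flags into params, fetches the dataset metadata and producer, then
# starts the producer's background generation thread.
#
#   params = parse_flags(FLAGS)
#   num_users, num_items, num_train_steps, num_eval_steps, producer = (
#       get_inputs(params))
#   producer.start()  # assumes the producer is a threading.Thread subclass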


def parse_flags(flags_obj):
  """Convenience function to turn flags into params."""
  num_gpus = flags_core.get_num_gpus(flags_obj)

  batch_size = flags_obj.batch_size
  eval_batch_size = flags_obj.eval_batch_size or flags_obj.batch_size

  return {
      "train_epochs": flags_obj.train_epochs,
      "batches_per_step": 1,
      "use_seed": flags_obj.seed is not None,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": flags_obj.learning_rate,
      "mf_dim": flags_obj.num_factors,
      "model_layers": [int(layer) for layer in flags_obj.layers],
      "mf_regularization": flags_obj.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
      "num_neg": flags_obj.num_neg,
      "distribution_strategy": flags_obj.distribution_strategy,
      "num_gpus": num_gpus,
      "use_tpu": flags_obj.tpu is not None,
      "tpu": flags_obj.tpu,
      "tpu_zone": flags_obj.tpu_zone,
      "tpu_gcp_project": flags_obj.tpu_gcp_project,
      "beta1": flags_obj.beta1,
      "beta2": flags_obj.beta2,
      "epsilon": flags_obj.epsilon,
      "match_mlperf": flags_obj.ml_perf,
      "epochs_between_evals": flags_obj.epochs_between_evals,
      "keras_use_ctl": flags_obj.keras_use_ctl,
      "hr_threshold": flags_obj.hr_threshold,
      "stream_files": flags_obj.tpu is not None,
      "train_dataset_path": flags_obj.train_dataset_path,
      "eval_dataset_path": flags_obj.eval_dataset_path,
      "input_meta_data_path": flags_obj.input_meta_data_path,
  }
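
# Example usage (illustrative sketch, not upstream code; flag values are
# hypothetical): once absl has parsed argv, the parsed flags map onto the
# params dict consumed by the rest of the NCF code.
#
#   define_ncf_flags()
#   FLAGS(["ncf_main", "--dataset=ml-1m", "--batch_size=1024"])
#   params = parse_flags(FLAGS)
#   assert params["eval_batch_size"] == params["batch_size"]  # no override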


def get_v1_distribution_strategy(params):
  """Returns the distribution strategy to use."""
  if params["use_tpu"]:
    # Some of the networking libraries are quite chatty. Note that the
    # standard-library logging module is needed here; the absl `logging`
    # module does not expose `getLogger`.
    for name in [
        "googleapiclient.discovery", "googleapiclient.discovery_cache",
        "oauth2client.transport"
    ]:
      python_logging.getLogger(name).setLevel(python_logging.ERROR)

    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=params["tpu"],
        zone=params["tpu_zone"],
        project=params["tpu_gcp_project"],
        coordinator_name="coordinator")

    logging.info("Issuing reset command to TPU to ensure a clean state.")
    tf.Session.reset(tpu_cluster_resolver.get_master())

    # Estimator looks at the master it connects to for MonitoredTrainingSession
    # by reading the `TF_CONFIG` environment variable, and the coordinator
    # is used by StreamingFilesDataset.
    tf_config_env = {
        "session_master":
            tpu_cluster_resolver.get_master(),
        "eval_session_master":
            tpu_cluster_resolver.get_master(),
        "coordinator":
            tpu_cluster_resolver.cluster_spec().as_dict()["coordinator"]
    }
    os.environ["TF_CONFIG"] = json.dumps(tf_config_env)

    distribution = tf.distribute.experimental.TPUStrategy(
        tpu_cluster_resolver, steps_per_run=100)

  else:
    distribution = distribution_utils.get_distribution_strategy(
        num_gpus=params["num_gpus"])

  return distribution
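
# Illustrative sketch (hypothetical caller, not upstream code): the returned
# strategy is typically threaded into an Estimator via RunConfig.
#
#   params = parse_flags(FLAGS)
#   strategy = get_v1_distribution_strategy(params)
#   run_config = tf.estimator.RunConfig(train_distribute=strategy)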


def define_ncf_flags():
  """Add flags for running ncf_main."""
  # Add common flags
  flags_core.define_base(
      model_dir=True,
      clean=True,
      train_epochs=True,
      epochs_between_evals=True,
      export_dir=False,
      run_eagerly=True,
      stop_threshold=True,
      num_gpu=True,
      distribution_strategy=True)
  flags_core.define_performance(
      synthetic_data=True,
      dtype=True,
      fp16_implementation=True,
      loss_scale=True,
      dynamic_loss_scale=True,
      enable_xla=True,
  )
  flags_core.define_device(tpu=True)
  flags_core.define_benchmark()

  flags.adopt_module_key_flags(flags_core)

  movielens.define_flags()

  flags_core.set_defaults(
      model_dir="/tmp/ncf/",
      data_dir="/tmp/movielens-data/",
      dataset=movielens.ML_1M,
      train_epochs=2,
      batch_size=99000,
      tpu=None)

  # Add ncf-specific flags
  flags.DEFINE_boolean(
      name="download_if_missing",
      default=True,
      help=flags_core.help_wrap(
          "Download data to data_dir if it is not already present."))

  flags.DEFINE_integer(
      name="eval_batch_size",
      default=None,
      help=flags_core.help_wrap(
          "The batch size used for evaluation. This should generally be "
          "larger than the training batch size, as the lack of back "
          "propagation during evaluation can allow for larger batch sizes "
          "to fit in memory. If not specified, the training batch size "
          "(--batch_size) will be used."))

  flags.DEFINE_integer(
      name="num_factors",
      default=8,
      help=flags_core.help_wrap("The embedding size of the MF model."))

  # Set the default as a list of strings to be consistent with input arguments
  flags.DEFINE_list(
      name="layers",
      default=["64", "32", "16", "8"],
      help=flags_core.help_wrap(
          "The sizes of the hidden layers for the MLP. Example of specifying "
          "different sizes of MLP layers: --layers=32,16,8,4"))

  flags.DEFINE_float(
      name="mf_regularization",
      default=0.,
      help=flags_core.help_wrap(
          "The regularization factor for MF embeddings. The factor is used by "
          "a regularizer, which applies penalties to layer parameters or "
          "layer activity during optimization."))

  flags.DEFINE_list(
      name="mlp_regularization",
      default=["0.", "0.", "0.", "0."],
      help=flags_core.help_wrap(
          "The regularization factor for each MLP layer. See the "
          "mf_regularization help for more info about the regularization "
          "factor."))

  flags.DEFINE_integer(
      name="num_neg",
      default=4,
      help=flags_core.help_wrap(
          "The number of negative instances to pair with each positive "
          "instance."))

  flags.DEFINE_float(
      name="learning_rate",
      default=0.001,
      help=flags_core.help_wrap("The learning rate."))

  flags.DEFINE_float(
      name="beta1",
      default=0.9,
      help=flags_core.help_wrap("beta1 hyperparameter for the Adam optimizer."))

  flags.DEFINE_float(
      name="beta2",
      default=0.999,
      help=flags_core.help_wrap("beta2 hyperparameter for the Adam optimizer."))

  flags.DEFINE_float(
      name="epsilon",
      default=1e-8,
      help=flags_core.help_wrap("epsilon hyperparameter for the Adam "
                                "optimizer."))

  flags.DEFINE_float(
      name="hr_threshold",
      default=1.0,
      help=flags_core.help_wrap(
          "If passed, training will stop when the evaluation metric HR is "
          "greater than or equal to hr_threshold. For the ml-1m dataset, the "
          "desired hr_threshold is 0.68, which is the result from the paper; "
          "for the ml-20m dataset, the threshold can be set to 0.95, which is "
          "achieved by the MLPerf implementation."))

  flags.DEFINE_enum(
      name="constructor_type",
      default="bisection",
      enum_values=["bisection", "materialized"],
      case_sensitive=False,
      help=flags_core.help_wrap(
          "Strategy to use for generating false negatives. materialized has "
          "a precompute that scales badly, but a faster per-epoch "
          "construction time, and can be faster on very large systems."))

  flags.DEFINE_string(
      name="train_dataset_path",
      default=None,
      help=flags_core.help_wrap("Path to training data."))

  flags.DEFINE_string(
      name="eval_dataset_path",
      default=None,
      help=flags_core.help_wrap("Path to evaluation data."))

  flags.DEFINE_string(
      name="input_meta_data_path",
      default=None,
      help=flags_core.help_wrap("Path to input meta data file."))

  flags.DEFINE_bool(
      name="ml_perf",
      default=False,
      help=flags_core.help_wrap(
          "If set, changes the behavior of the model slightly to match the "
          "MLPerf reference implementations here: \n"
          "https://github.com/mlperf/reference/tree/master/recommendation/"
          "pytorch\n"
          "The two changes are:\n"
          "1. When computing the HR and NDCG during evaluation, remove "
          "duplicate user-item pairs before the computation. This results in "
          "better HRs and NDCGs.\n"
          "2. Use a different sorting algorithm when sorting the input data, "
          "which performs better because the sorting algorithms are not "
          "stable."))

  flags.DEFINE_bool(
      name="output_ml_perf_compliance_logging",
      default=False,
      help=flags_core.help_wrap(
          "If set, output the MLPerf compliance logging. This is only useful "
          "if one is running the model for MLPerf. See "
          "https://github.com/mlperf/policies/blob/master/training_rules.adoc"
          "#submission-compliance-logs for details. This uses sudo and so may "
          "ask for your password, as root access is needed to clear the system "
          "caches, which is required for MLPerf compliance."))

  flags.DEFINE_integer(
      name="seed",
      default=None,
      help=flags_core.help_wrap(
          "This value will be used to seed both NumPy and TensorFlow."))

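  # Each evaluation example consists of one positive item plus
  # rconst.NUM_EVAL_NEGATIVES sampled negatives, so an eval batch must be
  # large enough to hold at least one complete example.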
  @flags.validator(
      "eval_batch_size",
      "eval_batch_size must be at least {}".format(rconst.NUM_EVAL_NEGATIVES +
                                                   1))
  def eval_size_check(eval_batch_size):
    return (eval_batch_size is None or
            int(eval_batch_size) > rconst.NUM_EVAL_NEGATIVES)

  flags.DEFINE_bool(
      name="early_stopping",
      default=False,
      help=flags_core.help_wrap(
          "If True, stop training when the evaluation HR metric reaches "
          "hr_threshold."))

  flags.DEFINE_bool(
      name="keras_use_ctl",
      default=False,
      help=flags_core.help_wrap(
          "If True, use a custom training loop for Keras."))


def convert_to_softmax_logits(logits):
  """Convert the logits returned by the base model to softmax logits.

  Args:
    logits: the raw logits returned by the base model.

  Returns:
    The logits with a column of zeros prepended, so that a softmax over the
    two columns is equivalent to a sigmoid applied to the original logits.
  """
  softmax_logits = tf.concat([logits * 0, logits], axis=1)
  return softmax_logits
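
# Illustrative check (sketch, not upstream code): a softmax over the pair
# [0, x] reproduces sigmoid(x) in its second column, since
# exp(x) / (exp(0) + exp(x)) == 1 / (1 + exp(-x)).
#
#   logits = tf.constant([[2.0], [-1.0]])
#   softmax_logits = convert_to_softmax_logits(logits)  # shape (2, 2)
#   probs = tf.nn.softmax(softmax_logits)[:, 1]
#   # probs matches tf.sigmoid(tf.squeeze(logits, axis=1))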