"git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "899cf5c4389b83fc7d913bf23ce07282d09ffb91"
Unverified Commit eb0c0dfd authored by Qianli Scott Zhu's avatar Qianli Scott Zhu Committed by GitHub
Browse files

Add dataset info and hyper parameter logging for benchmark. (#4152)

* Add dataset info and hyper parameter logging for benchmark.

* Address review comments.

* Address the review comment for data schema name.

* Fix test cases.

* Lint fix.
parent 8e73530e
......@@ -98,41 +98,41 @@
"type": "RECORD"
},
{
"description": "The list of hyperparameters of the model.",
"description": "The list of parameters run with the model. It could contain hyperparameters or others.",
"fields": [
{
"description": "The name of the hyperparameter.",
"description": "The name of the parameter.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The string value of the hyperparameter.",
"description": "The string value of the parameter.",
"mode": "NULLABLE",
"name": "string_value",
"type": "STRING"
},
{
"description": "The bool value of the hyperparameter.",
"description": "The bool value of the parameter.",
"mode": "NULLABLE",
"name": "bool_value",
"type": "STRING"
},
{
"description": "The int/long value of the hyperparameter.",
"description": "The int/long value of the parameter.",
"mode": "NULLABLE",
"name": "long_value",
"type": "INTEGER"
},
{
"description": "The double/float value of hyperparameter.",
"description": "The double/float value of parameter.",
"mode": "NULLABLE",
"name": "float_value",
"type": "FLOAT"
}
],
"mode": "REPEATED",
"name": "hyperparameter",
"name": "run_parameters",
"type": "RECORD"
},
{
......
......@@ -42,6 +42,8 @@ _NUM_IMAGES = {
'validation': 10000,
}
DATASET_NAME = 'CIFAR-10'
###############################################################################
# Data processing
......@@ -237,7 +239,7 @@ def run_cifar(flags_obj):
or input_fn)
resnet_run_loop.resnet_main(
flags_obj, cifar10_model_fn, input_function,
flags_obj, cifar10_model_fn, input_function, DATASET_NAME,
shape=[_HEIGHT, _WIDTH, _NUM_CHANNELS])
......
......@@ -41,6 +41,7 @@ _NUM_IMAGES = {
_NUM_TRAIN_FILES = 1024
_SHUFFLE_BUFFER = 1500
DATASET_NAME = 'ImageNet'
###############################################################################
# Data processing
......@@ -312,7 +313,7 @@ def run_imagenet(flags_obj):
or input_fn)
resnet_run_loop.resnet_main(
flags_obj, imagenet_model_fn, input_function,
flags_obj, imagenet_model_fn, input_function, DATASET_NAME,
shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
......
......@@ -331,7 +331,8 @@ def per_device_batch_size(batch_size, num_gpus):
return int(batch_size / num_gpus)
def resnet_main(flags_obj, model_function, input_function, shape=None):
def resnet_main(
flags_obj, model_function, input_function, dataset_name, shape=None):
"""Shared main loop for ResNet Models.
Args:
......@@ -342,6 +343,8 @@ def resnet_main(flags_obj, model_function, input_function, shape=None):
input_function: the function that processes the dataset and returns a
dataset that the estimator can train on. This will be wrapped with
all the relevant flags for running and passed to estimator.
dataset_name: the name of the dataset for training and evaluation. This is
used for logging purpose.
shape: list of ints representing the shape of the images used for training.
This is only used if flags_obj.export_dir is passed.
"""
......@@ -381,8 +384,16 @@ def resnet_main(flags_obj, model_function, input_function, shape=None):
'dtype': flags_core.get_tf_dtype(flags_obj)
})
run_params = {
'batch_size': flags_obj.batch_size,
'dtype': flags_core.get_tf_dtype(flags_obj),
'resnet_size': flags_obj.resnet_size,
'resnet_version': flags_obj.version,
'synthetic_data': flags_obj.use_synthetic_data,
'train_epochs': flags_obj.train_epochs,
}
benchmark_logger = logger.config_benchmark_logger(flags_obj.benchmark_log_dir)
benchmark_logger.log_run_info('resnet')
benchmark_logger.log_run_info('resnet', dataset_name, run_params)
train_hooks = hooks_helper.get_train_hooks(
flags_obj.hooks,
......
......@@ -109,8 +109,9 @@ class BaseBenchmarkLogger(object):
"Name %s, value %d, unit %s, global_step %d, extras %s",
name, value, unit, global_step, extras)
def log_run_info(self, model_name):
tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
def log_run_info(self, model_name, dataset_name, run_params):
  """Log the collected benchmark run information via tf.logging.

  Args:
    model_name: string, the name of the model.
    dataset_name: string, the name of dataset for training and evaluation.
    run_params: dict, the dictionary of parameters for the run, it could
      include hyperparameters or other params that are important for the run.
  """
  run_info = _gather_run_info(model_name, dataset_name, run_params)
  tf.logging.info("Benchmark run: %s", run_info)
class BenchmarkFileLogger(BaseBenchmarkLogger):
......@@ -159,15 +160,18 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
tf.logging.warning("Failed to dump metric to log file: "
"name %s, value %s, error %s", name, value, e)
def log_run_info(self, model_name):
def log_run_info(self, model_name, dataset_name, run_params):
"""Collect most of the TF runtime information for the local env.
The schema of the run info follows official/benchmark/datastore/schema.
Args:
model_name: string, the name of the model.
dataset_name: string, the name of dataset for training and evaluation.
run_params: dict, the dictionary of parameters for the run, it could
include hyperparameters or other params that are important for the run.
"""
run_info = _gather_run_info(model_name)
run_info = _gather_run_info(model_name, dataset_name, run_params)
with tf.gfile.GFile(os.path.join(
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
......@@ -179,15 +183,17 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
e)
def _gather_run_info(model_name):
def _gather_run_info(model_name, dataset_name, run_params):
"""Collect the benchmark run information for the local environment."""
run_info = {
"model_name": model_name,
"dataset": {"name": dataset_name},
"machine_config": {},
"run_date": datetime.datetime.utcnow().strftime(
_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_run_params(run_info, run_params)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)
......@@ -199,6 +205,21 @@ def _collect_tensorflow_info(run_info):
"version": tf.VERSION, "git_hash": tf.GIT_VERSION}
def _collect_run_params(run_info, run_params):
"""Log the parameter information for the benchmark run."""
def process_param(name, value):
type_check = {
str: {"name": name, "string_value": value},
int: {"name": name, "long_value": value},
bool: {"name": name, "bool_value": str(value)},
float: {"name": name, "float_value": value},
}
return type_check.get(type(value),
{"name": name, "string_value": str(value)})
if run_params:
run_info["run_parameters"] = [
process_param(k, v) for k, v in sorted(run_params.items())]
def _collect_tensorflow_environment_variables(run_info):
run_info["tensorflow_environment_variables"] = [
{"name": k, "value": v}
......
......@@ -180,6 +180,32 @@ class BenchmarkFileLoggerTest(tf.test.TestCase):
self.assertEqual(run_info["tensorflow_version"]["version"], tf.VERSION)
self.assertEqual(run_info["tensorflow_version"]["git_hash"], tf.GIT_VERSION)
def test_collect_run_params(self):
  """_collect_run_params should type-tag each value and sort by name."""
  run_info = {}
  run_parameters = {
      "batch_size": 32,
      "synthetic_data": True,
      "train_epochs": 100.00,
      "dtype": "fp16",
      "resnet_size": 50,
      "random_tensor": tf.constant(2.0)
  }
  logger._collect_run_params(run_info, run_parameters)

  # All six parameters are recorded, ordered alphabetically by name.
  self.assertEqual(len(run_info["run_parameters"]), 6)
  # int values go to long_value.
  self.assertEqual(run_info["run_parameters"][0],
                   {"name": "batch_size", "long_value": 32})
  # str values go to string_value.
  self.assertEqual(run_info["run_parameters"][1],
                   {"name": "dtype", "string_value": "fp16"})
  # Unsupported types (a tf.Tensor here) fall back to str(value).
  self.assertEqual(run_info["run_parameters"][2],
                   {"name": "random_tensor", "string_value":
                    "Tensor(\"Const:0\", shape=(), dtype=float32)"})
  self.assertEqual(run_info["run_parameters"][3],
                   {"name": "resnet_size", "long_value": 50})
  # bool values are stored as their string form under bool_value.
  self.assertEqual(run_info["run_parameters"][4],
                   {"name": "synthetic_data", "bool_value": "True"})
  # float values go to float_value.
  self.assertEqual(run_info["run_parameters"][5],
                   {"name": "train_epochs", "float_value": 100.00})
def test_collect_tensorflow_environment_variables(self):
os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
os.environ["TF_OTHER"] = "2"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment