Unverified commit eb0c0dfd authored by Qianli Scott Zhu, committed by GitHub

Add dataset info and hyper parameter logging for benchmark. (#4152)

* Add dataset info and hyper parameter logging for benchmark.

* Address review comments.

* Address the review comment for the data schema name.

* Fix test cases.

* Lint fix.
parent 8e73530e
@@ -98,41 +98,41 @@
       "type": "RECORD"
     },
     {
-      "description": "The list of hyperparameters of the model.",
+      "description": "The list of parameters run with the model. It could contain hyperparameters or others.",
       "fields": [
         {
-          "description": "The name of the hyperparameter.",
+          "description": "The name of the parameter.",
           "mode": "REQUIRED",
           "name": "name",
           "type": "STRING"
         },
         {
-          "description": "The string value of the hyperparameter.",
+          "description": "The string value of the parameter.",
           "mode": "NULLABLE",
           "name": "string_value",
           "type": "STRING"
         },
         {
-          "description": "The bool value of the hyperparameter.",
+          "description": "The bool value of the parameter.",
           "mode": "NULLABLE",
           "name": "bool_value",
           "type": "STRING"
         },
         {
-          "description": "The int/long value of the hyperparameter.",
+          "description": "The int/long value of the parameter.",
          "mode": "NULLABLE",
          "name": "long_value",
          "type": "INTEGER"
        },
        {
-          "description": "The double/float value of hyperparameter.",
+          "description": "The double/float value of the parameter.",
          "mode": "NULLABLE",
          "name": "float_value",
          "type": "FLOAT"
        }
      ],
      "mode": "REPEATED",
-      "name": "hyperparameter",
+      "name": "run_parameters",
      "type": "RECORD"
    },
    {
...
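
For context, each entry in the renamed run_parameters record carries the parameter name plus exactly one of the typed value columns, chosen by the parameter's Python type. A minimal sketch of the payload shape this schema describes (illustrative values, mirroring the unit test added later in this change):

    # Sketch of a run_parameters payload under the schema above; each entry
    # fills "name" and exactly one of string_value / bool_value / long_value /
    # float_value. Values are illustrative only.
    run_parameters = [
        {"name": "batch_size", "long_value": 32},
        {"name": "dtype", "string_value": "fp16"},
        {"name": "synthetic_data", "bool_value": "True"},
        {"name": "train_epochs", "float_value": 100.0},
    ]
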
@@ -42,6 +42,8 @@ _NUM_IMAGES = {
     'validation': 10000,
 }
 
+DATASET_NAME = 'CIFAR-10'
+
 
 ###############################################################################
 # Data processing
@@ -237,7 +239,7 @@ def run_cifar(flags_obj):
                     or input_fn)
 
   resnet_run_loop.resnet_main(
-      flags_obj, cifar10_model_fn, input_function,
+      flags_obj, cifar10_model_fn, input_function, DATASET_NAME,
       shape=[_HEIGHT, _WIDTH, _NUM_CHANNELS])
...
@@ -41,6 +41,7 @@ _NUM_IMAGES = {
 _NUM_TRAIN_FILES = 1024
 _SHUFFLE_BUFFER = 1500
 
+DATASET_NAME = 'ImageNet'
 
 ###############################################################################
 # Data processing
@@ -312,7 +313,7 @@ def run_imagenet(flags_obj):
                     or input_fn)
 
   resnet_run_loop.resnet_main(
-      flags_obj, imagenet_model_fn, input_function,
+      flags_obj, imagenet_model_fn, input_function, DATASET_NAME,
       shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
...
@@ -331,7 +331,8 @@ def per_device_batch_size(batch_size, num_gpus):
   return int(batch_size / num_gpus)
 
 
-def resnet_main(flags_obj, model_function, input_function, shape=None):
+def resnet_main(
+    flags_obj, model_function, input_function, dataset_name, shape=None):
   """Shared main loop for ResNet Models.
 
   Args:
@@ -342,6 +343,8 @@ def resnet_main(flags_obj, model_function, input_function, shape=None):
     input_function: the function that processes the dataset and returns a
       dataset that the estimator can train on. This will be wrapped with
       all the relevant flags for running and passed to estimator.
+    dataset_name: the name of the dataset for training and evaluation. This is
+      used for logging purposes.
     shape: list of ints representing the shape of the images used for training.
       This is only used if flags_obj.export_dir is passed.
   """
@@ -381,8 +384,16 @@ def resnet_main(flags_obj, model_function, input_function, shape=None):
       'dtype': flags_core.get_tf_dtype(flags_obj)
   })
 
+  run_params = {
+      'batch_size': flags_obj.batch_size,
+      'dtype': flags_core.get_tf_dtype(flags_obj),
+      'resnet_size': flags_obj.resnet_size,
+      'resnet_version': flags_obj.version,
+      'synthetic_data': flags_obj.use_synthetic_data,
+      'train_epochs': flags_obj.train_epochs,
+  }
+
   benchmark_logger = logger.config_benchmark_logger(flags_obj.benchmark_log_dir)
-  benchmark_logger.log_run_info('resnet')
+  benchmark_logger.log_run_info('resnet', dataset_name, run_params)
 
   train_hooks = hooks_helper.get_train_hooks(
       flags_obj.hooks,
...
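
Any other model that uses the benchmark logger would follow the same calling pattern after this change. A hedged sketch for a hypothetical model (the model name, dataset name, and parameter values below are placeholders, not part of this commit; only config_benchmark_logger and the new log_run_info signature come from the diff):

    # Hypothetical usage of the updated logger API; assumes a flags_obj with a
    # benchmark_log_dir flag, as in resnet_run_loop above.
    run_params = {
        'batch_size': 64,   # placeholder values for illustration
        'train_epochs': 10,
    }
    benchmark_logger = logger.config_benchmark_logger(flags_obj.benchmark_log_dir)
    # dataset_name and run_params are the two arguments added by this change.
    benchmark_logger.log_run_info('my_model', 'MyDataset', run_params)
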
@@ -109,8 +109,9 @@ class BaseBenchmarkLogger(object):
                     "Name %s, value %d, unit %s, global_step %d, extras %s",
                     name, value, unit, global_step, extras)
 
-  def log_run_info(self, model_name):
-    tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
+  def log_run_info(self, model_name, dataset_name, run_params):
+    tf.logging.info("Benchmark run: %s",
+                    _gather_run_info(model_name, dataset_name, run_params))
 
 
 class BenchmarkFileLogger(BaseBenchmarkLogger):
@@ -159,15 +160,18 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
       tf.logging.warning("Failed to dump metric to log file: "
                          "name %s, value %s, error %s", name, value, e)
 
-  def log_run_info(self, model_name):
+  def log_run_info(self, model_name, dataset_name, run_params):
     """Collect most of the TF runtime information for the local env.
 
     The schema of the run info follows official/benchmark/datastore/schema.
 
     Args:
       model_name: string, the name of the model.
+      dataset_name: string, the name of the dataset for training and evaluation.
+      run_params: dict, the dictionary of parameters for the run; it could
+        include hyperparameters or other params that are important for the run.
    """
-    run_info = _gather_run_info(model_name)
+    run_info = _gather_run_info(model_name, dataset_name, run_params)
 
    with tf.gfile.GFile(os.path.join(
        self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
@@ -179,15 +183,17 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
                          e)
 
 
-def _gather_run_info(model_name):
+def _gather_run_info(model_name, dataset_name, run_params):
  """Collect the benchmark run information for the local environment."""
  run_info = {
      "model_name": model_name,
+      "dataset": {"name": dataset_name},
      "machine_config": {},
      "run_date": datetime.datetime.utcnow().strftime(
          _DATE_TIME_FORMAT_PATTERN)}
  _collect_tensorflow_info(run_info)
  _collect_tensorflow_environment_variables(run_info)
+  _collect_run_params(run_info, run_params)
  _collect_cpu_info(run_info)
  _collect_gpu_info(run_info)
  _collect_memory_info(run_info)
@@ -199,6 +205,21 @@ def _collect_tensorflow_info(run_info):
      "version": tf.VERSION, "git_hash": tf.GIT_VERSION}
 
+
+def _collect_run_params(run_info, run_params):
+  """Log the parameter information for the benchmark run."""
+  def process_param(name, value):
+    type_check = {
+        str: {"name": name, "string_value": value},
+        int: {"name": name, "long_value": value},
+        bool: {"name": name, "bool_value": str(value)},
+        float: {"name": name, "float_value": value},
+    }
+    return type_check.get(type(value),
+                          {"name": name, "string_value": str(value)})
+  if run_params:
+    run_info["run_parameters"] = [
+        process_param(k, v) for k, v in sorted(run_params.items())]
+
 
 def _collect_tensorflow_environment_variables(run_info):
  run_info["tensorflow_environment_variables"] = [
      {"name": k, "value": v}
...
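
One detail of _collect_run_params above is worth calling out: the dispatch table is keyed by the exact type(value), not isinstance, so True maps to bool_value (it is not swallowed by the int entry even though bool subclasses int), and any type outside the table falls back to its str() representation under string_value. A small sketch of the resulting structure, assuming the module is imported as logger (values mirror the unit test below):

    # Exact-type dispatch: bools stay bool_value, unknown types fall back to str().
    run_info = {}
    logger._collect_run_params(run_info, {"batch_size": 32, "synthetic_data": True})
    # run_info["run_parameters"] is now (sorted by name):
    #   [{"name": "batch_size", "long_value": 32},
    #    {"name": "synthetic_data", "bool_value": "True"}]
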
@@ -180,6 +180,32 @@ class BenchmarkFileLoggerTest(tf.test.TestCase):
     self.assertEqual(run_info["tensorflow_version"]["version"], tf.VERSION)
     self.assertEqual(run_info["tensorflow_version"]["git_hash"], tf.GIT_VERSION)
 
+  def test_collect_run_params(self):
+    run_info = {}
+    run_parameters = {
+        "batch_size": 32,
+        "synthetic_data": True,
+        "train_epochs": 100.00,
+        "dtype": "fp16",
+        "resnet_size": 50,
+        "random_tensor": tf.constant(2.0)
+    }
+    logger._collect_run_params(run_info, run_parameters)
+    self.assertEqual(len(run_info["run_parameters"]), 6)
+    self.assertEqual(run_info["run_parameters"][0],
+                     {"name": "batch_size", "long_value": 32})
+    self.assertEqual(run_info["run_parameters"][1],
+                     {"name": "dtype", "string_value": "fp16"})
+    self.assertEqual(run_info["run_parameters"][2],
+                     {"name": "random_tensor", "string_value":
+                      "Tensor(\"Const:0\", shape=(), dtype=float32)"})
+    self.assertEqual(run_info["run_parameters"][3],
+                     {"name": "resnet_size", "long_value": 50})
+    self.assertEqual(run_info["run_parameters"][4],
+                     {"name": "synthetic_data", "bool_value": "True"})
+    self.assertEqual(run_info["run_parameters"][5],
+                     {"name": "train_epochs", "float_value": 100.00})
+
   def test_collect_tensorflow_environment_variables(self):
     os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
     os.environ["TF_OTHER"] = "2"
...