# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Logging utilities for benchmarks.

For collecting local environment metrics like CPU and memory, certain Python
packages need to be installed. See README for details.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import json
import multiprocessing
import numbers
import os

import tensorflow as tf
from tensorflow.python.client import device_lib

METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


class BenchmarkLogger(object):
  """Class to log the benchmark information to local disk."""

  def __init__(self, logging_dir):
    self._logging_dir = logging_dir
    if not tf.gfile.IsDirectory(self._logging_dir):
      tf.gfile.MakeDirs(self._logging_dir)

  def log_estimator_evaluation_result(self, eval_results):
    """Log the evaluation result for an estimator.

    The evaluation result is a dictionary that contains the metrics defined in
    model_fn. It also contains an entry for global_step, which holds the value
    of the global step when the evaluation was performed.

    Args:
      eval_results: dict, the result of evaluate() from an estimator.
    """
    if not isinstance(eval_results, dict):
      tf.logging.warning(
          "eval_results should be a dictionary for logging. Got %s",
          type(eval_results))
      return
    global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
    for key in sorted(eval_results):
      if key != tf.GraphKeys.GLOBAL_STEP:
        self.log_metric(key, eval_results[key], global_step=global_step)

  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
    """Log the benchmark metric information to a local file.

    Currently the logging is done synchronously. This should be updated to log
    asynchronously.

    Args:
      name: string, the name of the metric to log.
      value: number, the value of the metric. The value will not be logged if
        it is not a number type.
      unit: string, the unit of the metric, e.g. "image per second".
      global_step: int, the global_step when the metric is logged.
      extras: map of string:string, extra information about the metric.
    """
    if not isinstance(value, numbers.Number):
      tf.logging.warning(
          "Metric value to log should be a number. Got %s", type(value))
      return
    if extras:
      extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
    else:
      extras = []

    with tf.gfile.GFile(
        os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
      metric = {
          "name": name,
          "value": float(value),
          "unit": unit,
          "global_step": global_step,
          "timestamp": datetime.datetime.now().strftime(
              _DATE_TIME_FORMAT_PATTERN),
          "extras": extras}
      try:
        json.dump(metric, f)
        f.write("\n")
      except (TypeError, ValueError) as e:
        tf.logging.warning("Failed to dump metric to log file: "
                           "name %s, value %s, error %s", name, value, e)

  def log_run_info(self, model_name):
    """Collect most of the TF runtime information for the local environment.

    The schema of the run info follows official/benchmark/datastore/schema.

    Args:
      model_name: string, the name of the model.
    """
    run_info = {
        "model_name": model_name,
        "machine_config": {},
        "run_date": datetime.datetime.now().strftime(
            _DATE_TIME_FORMAT_PATTERN)}
    _collect_tensorflow_info(run_info)
    _collect_tensorflow_environment_variables(run_info)
    _collect_cpu_info(run_info)
    _collect_gpu_info(run_info)
    _collect_memory_info(run_info)

    with tf.gfile.GFile(os.path.join(
        self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
      try:
        json.dump(run_info, f)
        f.write("\n")
      except (TypeError, ValueError) as e:
        tf.logging.warning(
            "Failed to dump benchmark run info to log file: %s", e)


def _collect_tensorflow_info(run_info):
  run_info["tensorflow_version"] = {
      "version": tf.VERSION, "git_hash": tf.GIT_VERSION}


def _collect_tensorflow_environment_variables(run_info):
  run_info["tensorflow_environment_variables"] = [
      {"name": k, "value": v}
      for k, v in sorted(os.environ.items())
      if k.startswith("TF_")]


# The following code is mirrored from tensorflow/tools/test/system_info_lib,
# which is not exposed for import.
def _collect_cpu_info(run_info):
  """Collect the CPU information for the local environment."""
  cpu_info = {}

  cpu_info["num_cores"] = multiprocessing.cpu_count()

  # Note: cpuinfo is not installed in the TensorFlow OSS tree.
  # It is installable via pip.
  import cpuinfo  # pylint: disable=g-import-not-at-top

  info = cpuinfo.get_cpu_info()
  cpu_info["cpu_info"] = info["brand"]
  cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6

  run_info["machine_config"]["cpu_info"] = cpu_info


def _collect_gpu_info(run_info):
  """Collect local GPU information via the TF device library."""
  gpu_info = {}
  local_device_protos = device_lib.list_local_devices()

  gpu_info["count"] = len([d for d in local_device_protos
                           if d.device_type == "GPU"])
  # The device description is a comma-separated string of key:value pairs
  # that contains the GPU model info, e.g.:
  # "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0"
  for d in local_device_protos:
    if d.device_type == "GPU":
      # Assume all the connected GPUs are the same model.
      gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
      break
  run_info["machine_config"]["gpu_info"] = gpu_info


def _collect_memory_info(run_info):
  # Note: psutil is not installed in the TensorFlow OSS tree.
  # It is installable via pip.
  import psutil  # pylint: disable=g-import-not-at-top
  vmem = psutil.virtual_memory()
  run_info["machine_config"]["memory_total"] = vmem.total
  run_info["machine_config"]["memory_available"] = vmem.available


def _parse_gpu_model(physical_device_desc):
  # Extract the "name" value from a description like
  # "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0".
  for kv in physical_device_desc.split(","):
    k, _, v = kv.partition(":")
    if k.strip() == "name":
      return v.strip()
  return None
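

# A minimal usage sketch, assuming the optional `cpuinfo` and `psutil` pip
# packages are installed. The logging directory, model name, and metric
# values below are hypothetical, chosen only for illustration.
if __name__ == "__main__":
  demo_logger = BenchmarkLogger("/tmp/benchmark_demo")
  # Writes the machine config and TF runtime info to benchmark_run.log.
  demo_logger.log_run_info("resnet50")
  # Appends one JSON object per line to metric.log, e.g.:
  # {"name": "accuracy", "value": 0.762, "unit": null, "global_step": 1000,
  #  "timestamp": "...", "extras": [{"name": "dataset", "value": "imagenet"}]}
  demo_logger.log_metric("accuracy", 0.762, global_step=1000,
                         extras={"dataset": "imagenet"})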