logger.py

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Logging utilities for benchmark.

For collecting local environment metrics like CPU and memory, certain python
packages need be installed. See README for details.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import json
import multiprocessing
import numbers
import os
import threading

import tensorflow as tf
from tensorflow.python.client import device_lib

METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


# Don't use it directly. Use get_benchmark_logger to access a logger.
_benchmark_logger = None
_logger_lock = threading.Lock()


def config_benchmark_logger(logging_dir):
  """Config the global benchmark logger"""
  _logger_lock.acquire()
  try:
    global _benchmark_logger
    if logging_dir:
      _benchmark_logger = BenchmarkFileLogger(logging_dir)
    else:
      _benchmark_logger = BaseBenchmarkLogger()
  finally:
    _logger_lock.release()
  return _benchmark_logger


def get_benchmark_logger():
  if not _benchmark_logger:
    config_benchmark_logger(None)

  return _benchmark_logger


class BaseBenchmarkLogger(object):
  """Class to log the benchmark information to STDOUT."""

  def log_evaluation_result(self, eval_results):
    """Log the evaluation result.

    The evaluate result is a dictionary that contains metrics defined in
    model_fn. It also contains a entry for global_step which contains the value
    of the global step when evaluation was performed.

    Args:
      eval_results: dict, the result of evaluate.
    """
    if not isinstance(eval_results, dict):
      tf.logging.warning("eval_results should be dictionary for logging. "
                         "Got %s", type(eval_results))
      return
    global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
    for key in sorted(eval_results):
      if key != tf.GraphKeys.GLOBAL_STEP:
        self.log_metric(key, eval_results[key], global_step=global_step)

  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
    """Log the benchmark metric information to local file.

    Currently the logging is done in a synchronized way. This should be updated
    to log asynchronously.

    Args:
      name: string, the name of the metric to log.
      value: number, the value of the metric. The value will not be logged if it
        is not a number type.
      unit: string, the unit of the metric, E.g "image per second".
      global_step: int, the global_step when the metric is logged.
      extras: map of string:string, the extra information about the metric.
    """
    if not isinstance(value, numbers.Number):
      tf.logging.warning(
          "Metric value to log should be a number. Got %s", type(value))
      return
    extras = _convert_to_json_dict(extras)

    tf.logging.info("Benchmark metric: "
                    "Name %s, value %d, unit %s, global_step %d, extras %s",
                    name, value, unit, global_step, extras)

  def log_run_info(self, model_name):
    tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))


class BenchmarkFileLogger(BaseBenchmarkLogger):
  """Class to log the benchmark information to local disk."""

  def __init__(self, logging_dir):
    super(BenchmarkFileLogger, self).__init__()
    self._logging_dir = logging_dir
    if not tf.gfile.IsDirectory(self._logging_dir):
      tf.gfile.MakeDirs(self._logging_dir)

  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
    """Log the benchmark metric information to local file.

    Currently the logging is done in a synchronized way. This should be updated
    to log asynchronously.

    Args:
      name: string, the name of the metric to log.
      value: number, the value of the metric. The value will not be logged if it
        is not a number type.
      unit: string, the unit of the metric, E.g "image per second".
      global_step: int, the global_step when the metric is logged.
      extras: map of string:string, the extra information about the metric.
    """
    if not isinstance(value, numbers.Number):
      tf.logging.warning(
          "Metric value to log should be a number. Got %s", type(value))
      return
    extras = _convert_to_json_dict(extras)

    with tf.gfile.GFile(
        os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
      metric = {
          "name": name,
          "value": float(value),
          "unit": unit,
          "global_step": global_step,
          "timestamp": datetime.datetime.now().strftime(
              _DATE_TIME_FORMAT_PATTERN),
          "extras": extras}
      try:
        json.dump(metric, f)
        f.write("\n")
      except (TypeError, ValueError) as e:
        tf.logging.warning("Failed to dump metric to log file: "
                           "name %s, value %s, error %s", name, value, e)

  def log_run_info(self, model_name):
    """Collect most of the TF runtime information for the local env.

    The schema of the run info follows official/benchmark/datastore/schema.

    Args:
      model_name: string, the name of the model.
    """
    run_info = _gather_run_info(model_name)

    with tf.gfile.GFile(os.path.join(
        self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
      try:
        json.dump(run_info, f)
        f.write("\n")
      except (TypeError, ValueError) as e:
        tf.logging.warning("Failed to dump benchmark run info to log file: %s",
                           e)


def _gather_run_info(model_name):
  """Collect the benchmark run information for the local environment."""
  run_info = {
      "model_name": model_name,
      "machine_config": {},
      "run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
  _collect_tensorflow_info(run_info)
  _collect_tensorflow_environment_variables(run_info)
  _collect_cpu_info(run_info)
  _collect_gpu_info(run_info)
  _collect_memory_info(run_info)
  return run_info


def _collect_tensorflow_info(run_info):
  run_info["tensorflow_version"] = {
      "version": tf.VERSION, "git_hash": tf.GIT_VERSION}


def _collect_tensorflow_environment_variables(run_info):
  run_info["tensorflow_environment_variables"] = [
      {"name": k, "value": v}
      for k, v in sorted(os.environ.items()) if k.startswith("TF_")]


# The following code is mirrored from tensorflow/tools/test/system_info_lib
# which is not exposed for import.
def _collect_cpu_info(run_info):
  """Collect the CPU information for the local environment."""
  cpu_info = {}

  cpu_info["num_cores"] = multiprocessing.cpu_count()

  # Note: cpuinfo is not installed in the TensorFlow OSS tree.
  # It is installable via pip.
  import cpuinfo    # pylint: disable=g-import-not-at-top

  info = cpuinfo.get_cpu_info()
  cpu_info["cpu_info"] = info["brand"]
  cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6

  run_info["machine_config"]["cpu_info"] = cpu_info


def _collect_gpu_info(run_info):
  """Collect local GPU information by TF device library."""
  gpu_info = {}
  local_device_protos = device_lib.list_local_devices()

  gpu_info["count"] = len([d for d in local_device_protos
                           if d.device_type == "GPU"])
  # The device description usually is a JSON string, which contains the GPU
  # model info, eg:
  # "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0"
  for d in local_device_protos:
    if d.device_type == "GPU":
      gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
      # Assume all the GPU connected are same model
      break
  run_info["machine_config"]["gpu_info"] = gpu_info


def _collect_memory_info(run_info):
  # Note: psutil is not installed in the TensorFlow OSS tree.
  # It is installable via pip.
  import psutil   # pylint: disable=g-import-not-at-top
  vmem = psutil.virtual_memory()
  run_info["machine_config"]["memory_total"] = vmem.total
  run_info["machine_config"]["memory_available"] = vmem.available


def _parse_gpu_model(physical_device_desc):
  # Assume all the GPU connected are same model
  for kv in physical_device_desc.split(","):
    k, _, v = kv.partition(":")
    if k.strip() == "name":
      return v.strip()
  return None


def _convert_to_json_dict(input_dict):
  if input_dict:
    return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
  else:
    return []