Commit 0e4029f0 authored by Hongkun Yu, committed by A. Unique TensorFlower

BenchmarkBigQueryLogger is never used.

The logger was probably replaced by perfzero(?).

PiperOrigin-RevId: 307756692
parent 50dd4b4c
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to upload benchmark generated by BenchmarkLogger to remote repo.
This library require google cloud bigquery lib as dependency, which can be
installed with:
> pip install --upgrade google-cloud-bigquery
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
from google.cloud import bigquery
from google.cloud import exceptions
import tensorflow as tf


class BigQueryUploader(object):
  """Upload the benchmark and metric info from JSON input to BigQuery."""

  def __init__(self, gcp_project=None, credentials=None):
    """Initializes BigQueryUploader with the proper settings.

    Args:
      gcp_project: string, the name of the GCP project that the log will be
        uploaded to. The default project name will be detected from the local
        environment if no value is provided.
      credentials: google.auth.credentials. The credential to access the
        BigQuery service. The default service account credential will be
        detected from the local environment if no value is provided. Please use
        google.oauth2.service_account.Credentials to load credentials from a
        local file when the test is run outside of GCP.
    """
    self._bq_client = bigquery.Client(
        project=gcp_project, credentials=credentials)

  def upload_benchmark_run_json(
      self, dataset_name, table_name, run_id, run_json):
    """Upload benchmark run information to BigQuery.

    Args:
      dataset_name: string, the name of the BigQuery dataset where the data
        will be uploaded.
      table_name: string, the name of the BigQuery table under the dataset
        where the data will be uploaded.
      run_id: string, a unique ID that will be attached to the data, usually
        in UUID4 format.
      run_json: dict, the JSON data that contains the benchmark run info.
    """
    run_json["model_id"] = run_id
    self._upload_json(dataset_name, table_name, [run_json])

  def upload_benchmark_metric_json(
      self, dataset_name, table_name, run_id, metric_json_list):
    """Upload metric information to BigQuery.

    Args:
      dataset_name: string, the name of the BigQuery dataset where the data
        will be uploaded.
      table_name: string, the name of the BigQuery table under the dataset
        where the metric data will be uploaded. This is different from the
        benchmark_run table.
      run_id: string, a unique ID that will be attached to the data, usually
        in UUID4 format. This should be the same as the benchmark run_id.
      metric_json_list: list, a list of JSON objects that record the metric
        info.
    """
    for m in metric_json_list:
      m["run_id"] = run_id
    self._upload_json(dataset_name, table_name, metric_json_list)

  def upload_benchmark_run_file(
      self, dataset_name, table_name, run_id, run_json_file):
    """Upload benchmark run information to BigQuery from an input JSON file.

    Args:
      dataset_name: string, the name of the BigQuery dataset where the data
        will be uploaded.
      table_name: string, the name of the BigQuery table under the dataset
        where the data will be uploaded.
      run_id: string, a unique ID that will be attached to the data, usually
        in UUID4 format.
      run_json_file: string, the file path that contains the run JSON data.
    """
    with tf.io.gfile.GFile(run_json_file) as f:
      benchmark_json = json.load(f)
      self.upload_benchmark_run_json(
          dataset_name, table_name, run_id, benchmark_json)

  def upload_metric_file(
      self, dataset_name, table_name, run_id, metric_json_file):
    """Upload metric information to BigQuery from an input JSON file.

    Args:
      dataset_name: string, the name of the BigQuery dataset where the data
        will be uploaded.
      table_name: string, the name of the BigQuery table under the dataset
        where the metric data will be uploaded. This is different from the
        benchmark_run table.
      run_id: string, a unique ID that will be attached to the data, usually
        in UUID4 format. This should be the same as the benchmark run_id.
      metric_json_file: string, the file path that contains the metric JSON
        data.
    """
    with tf.io.gfile.GFile(metric_json_file) as f:
      metrics = []
      for line in f:
        metrics.append(json.loads(line.strip()))
      self.upload_benchmark_metric_json(
          dataset_name, table_name, run_id, metrics)

  def _upload_json(self, dataset_name, table_name, json_list):
    # Find the unique table reference based on the dataset and table name, so
    # that the data can be inserted into it.
    table_ref = self._bq_client.dataset(dataset_name).table(table_name)
    errors = self._bq_client.insert_rows_json(table_ref, json_list)
    if errors:
      tf.logging.error(
          "Failed to upload benchmark info to bigquery: {}".format(errors))

  def insert_run_status(self, dataset_name, table_name, run_id, run_status):
    """Insert the run status into the BigQuery run status table."""
    query = ("INSERT {ds}.{tb} "
             "(run_id, status) "
             "VALUES('{rid}', '{status}')").format(
                 ds=dataset_name, tb=table_name, rid=run_id, status=run_status)
    try:
      self._bq_client.query(query=query).result()
    except exceptions.GoogleCloudError as e:
      tf.logging.error("Failed to insert run status: %s", e)

  def update_run_status(self, dataset_name, table_name, run_id, run_status):
    """Update the run status in the BigQuery run status table."""
    query = ("UPDATE {ds}.{tb} "
             "SET status = '{status}' "
             "WHERE run_id = '{rid}'").format(
                 ds=dataset_name, tb=table_name, status=run_status, rid=run_id)
    try:
      self._bq_client.query(query=query).result()
    except exceptions.GoogleCloudError as e:
      tf.logging.error("Failed to update run status: %s", e)
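
A minimal usage sketch of the uploader above, assuming it is run outside of GCP
with a hypothetical service-account key file, project name, and an example
dataset whose run, metric, and status tables already exist with matching
schemas; none of these names come from this commit. Inside GCP, gcp_project and
credentials can simply be omitted and the defaults are detected, as the
__init__ docstring notes.

# Hypothetical usage sketch for BigQueryUploader; the project, key path, and
# dataset/table names below are placeholders, not values from this repository.
import uuid

from google.oauth2 import service_account

from official.benchmark import benchmark_uploader

credentials = service_account.Credentials.from_service_account_file(
    "/path/to/service_account_key.json")  # placeholder key file
uploader = benchmark_uploader.BigQueryUploader(
    gcp_project="my-gcp-project", credentials=credentials)

run_id = str(uuid.uuid4())
# One row describing the run, then a batch of metric rows tagged with run_id.
uploader.upload_benchmark_run_json(
    "example_dataset", "benchmark_run", run_id, {"model_name": "resnet56"})
uploader.upload_benchmark_metric_json(
    "example_dataset", "benchmark_metric", run_id,
    [{"name": "accuracy", "value": 0.92, "global_step": 1000}])
# Track the run in the status table: mark it running, then record the final
# status once the benchmark completes.
uploader.insert_run_status(
    "example_dataset", "benchmark_run_status", run_id, "running")
uploader.update_run_status(
    "example_dataset", "benchmark_run_status", run_id, "success")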
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Binary to upload benchmark generated by BenchmarkLogger to remote repo.
This library require google cloud bigquery lib as dependency, which can be
installed with:
> pip install --upgrade google-cloud-bigquery
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import uuid
from absl import app as absl_app
from absl import flags
from official.benchmark import benchmark_uploader
from official.utils.flags import core as flags_core
from official.utils.logs import logger


def main(_):
  if not flags.FLAGS.benchmark_log_dir:
    print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir")
    sys.exit(1)

  uploader = benchmark_uploader.BigQueryUploader(
      gcp_project=flags.FLAGS.gcp_project)
  run_id = str(uuid.uuid4())
  run_json_file = os.path.join(
      flags.FLAGS.benchmark_log_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME)
  metric_json_file = os.path.join(
      flags.FLAGS.benchmark_log_dir, logger.METRIC_LOG_FILE_NAME)

  uploader.upload_benchmark_run_file(
      flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_run_table, run_id,
      run_json_file)
  uploader.upload_metric_file(
      flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_metric_table, run_id,
      metric_json_file)
  # Assume the run finished successfully before the user invokes the upload
  # script.
  uploader.insert_run_status(
      flags.FLAGS.bigquery_data_set, flags.FLAGS.bigquery_run_status_table,
      run_id, logger.RUN_STATUS_SUCCESS)


if __name__ == "__main__":
  flags_core.define_benchmark()
  flags.adopt_module_key_flags(flags_core)
  absl_app.run(main=main)
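
For reference, a sketch of the log directory this binary consumes, inferred
from how upload_benchmark_run_file and upload_metric_file read their inputs:
the run log holds a single JSON object, while the metric log is
newline-delimited JSON with one metric per line. The directory path, file
names, and field values below are placeholders; the real file names come from
logger.BENCHMARK_RUN_LOG_FILE_NAME and logger.METRIC_LOG_FILE_NAME at runtime.

# Sketch of a --benchmark_log_dir layout that main() above could upload.
# "benchmark_run.log" and "metric.log" stand in for the file name constants
# defined in official.utils.logs.logger.
import json
import os

log_dir = "/tmp/benchmark_logs"  # placeholder for --benchmark_log_dir
os.makedirs(log_dir, exist_ok=True)

# Run info: a single JSON object, parsed back with json.load().
with open(os.path.join(log_dir, "benchmark_run.log"), "w") as f:
  json.dump({"model_name": "resnet56", "dataset": "cifar-10"}, f)

# Metrics: newline-delimited JSON, parsed back with one json.loads() per line.
with open(os.path.join(log_dir, "metric.log"), "w") as f:
  for metric in [{"name": "accuracy", "value": 0.92, "global_step": 1000},
                 {"name": "loss", "value": 0.31, "global_step": 1000}]:
    f.write(json.dumps(metric) + "\n")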
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark_uploader."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import tempfile
import unittest
from mock import MagicMock
from mock import patch
import tensorflow as tf # pylint: disable=g-bad-import-order

try:
  from google.cloud import bigquery
  from official.benchmark import benchmark_uploader
except ImportError:
  bigquery = None
  benchmark_uploader = None


@unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
class BigQueryUploaderTest(tf.test.TestCase):

  @patch.object(bigquery, "Client")
  def setUp(self, mock_bigquery):
    self.mock_client = mock_bigquery.return_value
    self.mock_dataset = MagicMock(name="dataset")
    self.mock_table = MagicMock(name="table")
    self.mock_client.dataset.return_value = self.mock_dataset
    self.mock_dataset.table.return_value = self.mock_table
    self.mock_client.insert_rows_json.return_value = []
    self.benchmark_uploader = benchmark_uploader.BigQueryUploader()
    self.benchmark_uploader._bq_client = self.mock_client
    self.log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    with open(os.path.join(self.log_dir, "metric.log"), "a") as f:
      json.dump({"name": "accuracy", "value": 1.0}, f)
      f.write("\n")
      json.dump({"name": "loss", "value": 0.5}, f)
      f.write("\n")
    with open(os.path.join(self.log_dir, "run.log"), "w") as f:
      json.dump({"model_name": "value"}, f)

  def tearDown(self):
    tf.io.gfile.rmtree(self.get_temp_dir())

  def test_upload_benchmark_run_json(self):
    self.benchmark_uploader.upload_benchmark_run_json(
        "dataset", "table", "run_id", {"model_name": "value"})
    self.mock_client.insert_rows_json.assert_called_once_with(
        self.mock_table, [{"model_name": "value", "model_id": "run_id"}])

  def test_upload_benchmark_metric_json(self):
    metric_json_list = [
        {"name": "accuracy", "value": 1.0},
        {"name": "loss", "value": 0.5}
    ]
    expected_params = [
        {"run_id": "run_id", "name": "accuracy", "value": 1.0},
        {"run_id": "run_id", "name": "loss", "value": 0.5}
    ]
    self.benchmark_uploader.upload_benchmark_metric_json(
        "dataset", "table", "run_id", metric_json_list)
    self.mock_client.insert_rows_json.assert_called_once_with(
        self.mock_table, expected_params)

  def test_upload_benchmark_run_file(self):
    self.benchmark_uploader.upload_benchmark_run_file(
        "dataset", "table", "run_id", os.path.join(self.log_dir, "run.log"))
    self.mock_client.insert_rows_json.assert_called_once_with(
        self.mock_table, [{"model_name": "value", "model_id": "run_id"}])

  def test_upload_metric_file(self):
    self.benchmark_uploader.upload_metric_file(
        "dataset", "table", "run_id",
        os.path.join(self.log_dir, "metric.log"))
    expected_params = [
        {"run_id": "run_id", "name": "accuracy", "value": 1.0},
        {"run_id": "run_id", "name": "loss", "value": 0.5}
    ]
    self.mock_client.insert_rows_json.assert_called_once_with(
        self.mock_table, expected_params)

  def test_insert_run_status(self):
    self.benchmark_uploader.insert_run_status(
        "dataset", "table", "run_id", "status")
    expected_query = ("INSERT dataset.table "
                      "(run_id, status) "
                      "VALUES('run_id', 'status')")
    self.mock_client.query.assert_called_once_with(query=expected_query)

  def test_update_run_status(self):
    self.benchmark_uploader.update_run_status(
        "dataset", "table", "run_id", "status")
    expected_query = ("UPDATE dataset.table "
                      "SET status = 'status' "
                      "WHERE run_id = 'run_id'")
    self.mock_client.query.assert_called_once_with(query=expected_query)


if __name__ == "__main__":
  tf.test.main()
@@ -46,8 +46,7 @@ def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True):
   flags.DEFINE_enum(
       name="benchmark_logger_type", default="BaseBenchmarkLogger",
-      enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger",
-                   "BenchmarkBigQueryLogger"],
+      enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger"],
       help=help_wrap("The type of benchmark logger to use. Defaults to using "
                      "BaseBenchmarkLogger which logs to STDOUT. Different "
                      "loggers will require other flags to be able to work."))
@@ -68,16 +68,6 @@ def config_benchmark_logger(flag_obj=None):
     _benchmark_logger = BaseBenchmarkLogger()
   elif flag_obj.benchmark_logger_type == "BenchmarkFileLogger":
     _benchmark_logger = BenchmarkFileLogger(flag_obj.benchmark_log_dir)
-  elif flag_obj.benchmark_logger_type == "BenchmarkBigQueryLogger":
-    from official.benchmark import benchmark_uploader as bu  # pylint: disable=g-import-not-at-top
-    bq_uploader = bu.BigQueryUploader(gcp_project=flag_obj.gcp_project)
-    _benchmark_logger = BenchmarkBigQueryLogger(
-        bigquery_uploader=bq_uploader,
-        bigquery_data_set=flag_obj.bigquery_data_set,
-        bigquery_run_table=flag_obj.bigquery_run_table,
-        bigquery_run_status_table=flag_obj.bigquery_run_status_table,
-        bigquery_metric_table=flag_obj.bigquery_metric_table,
-        run_id=str(uuid.uuid4()))
   else:
     raise ValueError("Unrecognized benchmark_logger_type: %s"
                      % flag_obj.benchmark_logger_type)
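
With the branch above removed, the selection logic in config_benchmark_logger
reduces to the sketch below, reconstructed from the hunk's context lines;
module-level names such as _benchmark_logger, BaseBenchmarkLogger, and
BenchmarkFileLogger are defined elsewhere in the logger module. Users who
previously relied on BenchmarkBigQueryLogger can log with BenchmarkFileLogger
and upload the resulting files afterwards with the standalone uploader binary
shown earlier.

# Post-change shape of the logger selection, reconstructed from the context
# lines above; only the BigQuery branch is gone.
if flag_obj.benchmark_logger_type == "BaseBenchmarkLogger":
  _benchmark_logger = BaseBenchmarkLogger()
elif flag_obj.benchmark_logger_type == "BenchmarkFileLogger":
  _benchmark_logger = BenchmarkFileLogger(flag_obj.benchmark_log_dir)
else:
  raise ValueError("Unrecognized benchmark_logger_type: %s"
                   % flag_obj.benchmark_logger_type)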
@@ -219,86 +209,6 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
     self._metric_file_handler.close()


-class BenchmarkBigQueryLogger(BaseBenchmarkLogger):
-  """Class to log the benchmark information to BigQuery data store."""
-
-  def __init__(self,
-               bigquery_uploader,
-               bigquery_data_set,
-               bigquery_run_table,
-               bigquery_run_status_table,
-               bigquery_metric_table,
-               run_id):
-    super(BenchmarkBigQueryLogger, self).__init__()
-    self._bigquery_uploader = bigquery_uploader
-    self._bigquery_data_set = bigquery_data_set
-    self._bigquery_run_table = bigquery_run_table
-    self._bigquery_run_status_table = bigquery_run_status_table
-    self._bigquery_metric_table = bigquery_metric_table
-    self._run_id = run_id
-
-  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
-    """Log the benchmark metric information to bigquery.
-
-    Args:
-      name: string, the name of the metric to log.
-      value: number, the value of the metric. The value will not be logged if
-        it is not a number type.
-      unit: string, the unit of the metric, E.g "image per second".
-      global_step: int, the global_step when the metric is logged.
-      extras: map of string:string, the extra information about the metric.
-    """
-    metric = _process_metric_to_json(name, value, unit, global_step, extras)
-    if metric:
-      # Starting new thread for bigquery upload in case it might take long time
-      # and impact the benchmark and performance measurement. Starting a new
-      # thread might have potential performance impact for model that run on
-      # CPU.
-      thread.start_new_thread(
-          self._bigquery_uploader.upload_benchmark_metric_json,
-          (self._bigquery_data_set,
-           self._bigquery_metric_table,
-           self._run_id,
-           [metric]))
-
-  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
-    """Collect most of the TF runtime information for the local env.
-
-    The schema of the run info follows official/benchmark/datastore/schema.
-
-    Args:
-      model_name: string, the name of the model.
-      dataset_name: string, the name of dataset for training and evaluation.
-      run_params: dict, the dictionary of parameters for the run, it could
-        include hyperparameters or other params that are important for the run.
-      test_id: string, the unique name of the test run by the combination of
-        key parameters, eg batch size, num of GPU. It is hardware independent.
-    """
-    run_info = _gather_run_info(model_name, dataset_name, run_params, test_id)
-    # Starting new thread for bigquery upload in case it might take long time
-    # and impact the benchmark and performance measurement. Starting a new
-    # thread might have potential performance impact for model that run on CPU.
-    thread.start_new_thread(
-        self._bigquery_uploader.upload_benchmark_run_json,
-        (self._bigquery_data_set,
-         self._bigquery_run_table,
-         self._run_id,
-         run_info))
-    thread.start_new_thread(
-        self._bigquery_uploader.insert_run_status,
-        (self._bigquery_data_set,
-         self._bigquery_run_status_table,
-         self._run_id,
-         RUN_STATUS_RUNNING))
-
-  def on_finish(self, status):
-    self._bigquery_uploader.update_run_status(
-        self._bigquery_data_set,
-        self._bigquery_run_status_table,
-        self._run_id,
-        status)
-
-
 def _gather_run_info(model_name, dataset_name, run_params, test_id):
   """Collect the benchmark run information for the local environment."""
   run_info = {
@@ -67,14 +67,6 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     self.assertIsInstance(logger.get_benchmark_logger(),
                           logger.BenchmarkFileLogger)

-  @unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
-  @mock.patch.object(bigquery, "Client")
-  def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
-    with flagsaver.flagsaver(benchmark_logger_type="BenchmarkBigQueryLogger"):
-      logger.config_benchmark_logger()
-    self.assertIsInstance(logger.get_benchmark_logger(),
-                          logger.BenchmarkBigQueryLogger)
-
   @mock.patch("official.utils.logs.logger.config_benchmark_logger")
   def test_benchmark_context(self, mock_config_benchmark_logger):
     mock_logger = mock.MagicMock()
@@ -299,68 +291,5 @@ class BenchmarkFileLoggerTest(tf.test.TestCase):
     self.assertIsNotNone(run_info["machine_config"]["memory_available"])


-@unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
-class BenchmarkBigQueryLoggerTest(tf.test.TestCase):
-
-  def setUp(self):
-    super(BenchmarkBigQueryLoggerTest, self).setUp()
-    # Avoid pulling extra env vars from test environment which affects the test
-    # result, eg. Kokoro test has a TF_PKG env which affect the test case
-    # test_collect_tensorflow_environment_variables()
-    self.original_environ = dict(os.environ)
-    os.environ.clear()
-
-    self.mock_bq_uploader = mock.MagicMock()
-    self.logger = logger.BenchmarkBigQueryLogger(
-        self.mock_bq_uploader, "dataset", "run_table", "run_status_table",
-        "metric_table", "run_id")
-
-  def tearDown(self):
-    super(BenchmarkBigQueryLoggerTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    os.environ.clear()
-    os.environ.update(self.original_environ)
-
-  def test_log_metric(self):
-    self.logger.log_metric(
-        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})
-    expected_metric_json = [{
-        "name": "accuracy",
-        "value": 0.999,
-        "unit": None,
-        "global_step": 1e4,
-        "timestamp": mock.ANY,
-        "extras": [{"name": "name", "value": "value"}]
-    }]
-    # log_metric will call upload_benchmark_metric_json in a separate thread.
-    # Give it some grace period for the new thread before assert.
-    time.sleep(1)
-    self.mock_bq_uploader.upload_benchmark_metric_json.assert_called_once_with(
-        "dataset", "metric_table", "run_id", expected_metric_json)
-
-  @mock.patch("official.utils.logs.logger._gather_run_info")
-  def test_log_run_info(self, mock_gather_run_info):
-    run_info = {"model_name": "model_name",
-                "dataset": "dataset_name",
-                "run_info": "run_value"}
-    mock_gather_run_info.return_value = run_info
-    self.logger.log_run_info("model_name", "dataset_name", {})
-    # log_metric will call upload_benchmark_metric_json in a separate thread.
-    # Give it some grace period for the new thread before assert.
-    time.sleep(1)
-    self.mock_bq_uploader.upload_benchmark_run_json.assert_called_once_with(
-        "dataset", "run_table", "run_id", run_info)
-    self.mock_bq_uploader.insert_run_status.assert_called_once_with(
-        "dataset", "run_status_table", "run_id", "running")
-
-  def test_on_finish(self):
-    self.logger.on_finish(logger.RUN_STATUS_SUCCESS)
-    # log_metric will call upload_benchmark_metric_json in a separate thread.
-    # Give it some grace period for the new thread before assert.
-    time.sleep(1)
-    self.mock_bq_uploader.update_run_status.assert_called_once_with(
-        "dataset", "run_status_table", "run_id", logger.RUN_STATUS_SUCCESS)
-
-
 if __name__ == "__main__":
   tf.test.main()