"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "bb98c10abbc1a5d25712937c00a743bc8a8cb016"
Commit ee3997b3 authored by qianyj's avatar qianyj
Browse files

new tf branch for dtk21.10.1

parent 2795dc1f
# 简介
- TensorFlow 框架 训练 图像分类相关网络的代码,tensorflow 官方基准测试程序,使用的数据集是 imagenet。
TensorFlow 框架 训练 图像分类相关网络的代码,tensorflow 官方基准测试程序,使用的数据集是 imagenet。
# 测试运行
......@@ -8,35 +8,27 @@
## 基础 benchmark
- 创建 TensorFlow 运行时环境后,以 resnet50 网络为例,计算其 batch_size=32 num_gpu=1 条件下不同精度的性能,分为训练和推理两部分
- 创建 TensorFlow 运行时环境后,以 resnet50 网络为例,计算其 batch_size=32 num_gpu=1 条件下不同精度的性能
### fp32 train
python3 benchmark_cnn_tf_v1.14_compatible/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet50 --batch_size=32 --num_gpus=1 --num_epochs=90
python3 ./benchmarks-master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_format=NCHW --batch_size=128 --model=resnet50 --save_model_steps=10020 --optimizer=momentum --variable_update=parameter_server --print_training_accuracy=true --eval_during_training_every_n_epochs=1 --nodistortions --num_gpus=1 --num_epochs=90 --weight_decay=1e-4 --data_dir=$data_dir_path --use_fp16=False --data_name=imagenet --train_dir=$save_checkpoint_path
### fp16 train
python3 benchmark_cnn_tf_v1.14_compatible/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet50 --use_fp16=true --fp16_enable_auto_loss_scale=true --batch_size=32 --num_gpus=1 --num_epochs=90
### fp32 inference
python3 benchmark_cnn_tf_v1.14_compatible/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet50 --batch_size=1 --num_gpus=1 --forward_only --num_batches=500
### fp16 inference
python3 benchmark_cnn_tf_v1.14_compatible/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --use_fp16=true --fp16_enable_auto_loss_scale=true --model=resnet50 --batch_size=1 --num_gpus=1 --forward_only --num_batches=500
python3 ./benchmarks-master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_format=NCHW --batch_size=128 --model=resnet50 --save_model_steps=10020 --optimizer=momentum --variable_update=parameter_server --print_training_accuracy=true --eval_during_training_every_n_epochs=1 --nodistortions --num_gpus=1 --num_epochs=90 --weight_decay=1e-4 --data_dir=$data_dir_path --use_fp16=True --data_name=imagenet --train_dir=$save_checkpoint_path
## 大规模测试
### 单卡
HIP_VISIBLE_DEVICES=0 python3 tensorflow_synthetic_benchmark.py --model=ResNet50 --batch-size=128 --num-iters=500
HIP_VISIBLE_DEVICES=0 python3 ./benchmarks-master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_format=NCHW --batch_size=128 --model=resnet50 --save_model_steps=10020 --optimizer=momentum --variable_update=parameter_server --print_training_accuracy=true --eval_during_training_every_n_epochs=1 --nodistortions --num_gpus=1 --num_epochs=90 --weight_decay=1e-4 --data_dir=$data_dir_path --use_fp16=True --data_name=imagenet --train_dir=$save_checkpoint_path
### 多卡
mpirun -np ${num_gpu} --hostfile hostfile --bind-to none scripts-run/single_process.sh
mpirun -np ${num_gpu} --hostfile hostfile --bind-to none scripts-run/single_process.sh
# 参考资料
[https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks]
[https://github.com/horovod/horovod]
[https://github.com/tensorflow/benchmarks/tree/cnn_tf_v1.14_compatible/scripts/tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/cnn_tf_v1.14_compatible/scripts/tf_cnn_benchmarks)
[https://github.com/horovod/horovod/tree/master/examples/tensorflow](https://github.com/horovod/horovod/tree/master/examples/tensorflow)
# TensorFlow benchmarks
This repository contains various TensorFlow benchmarks. Currently, it consists of two projects:
1. [PerfZero](https://github.com/tensorflow/benchmarks/tree/master/perfzero): A benchmark framework for TensorFlow.
2. [scripts/tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks): The TensorFlow CNN benchmarks contain benchmarks for several convolutional neural networks.
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu-2.0-preview
# - Installs requirements.txt for tensorflow/models
# - Install bazel for building TF from source
FROM nvidia/cuda:10.0-base-ubuntu18.04 AS base

# Build-time knobs: the TensorFlow pip package to install, optional extra
# pip specs, and an optional local .whl copied into the image by setup.py.
ARG tensorflow_pip_spec="tf-nightly-gpu-2.0-preview"
ARG extra_pip_specs=""
ARG local_tensorflow_pip_spec=""
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}

# Pick up some TF dependencies.
# Static CUDA/cuDNN archives are deleted because TF links the shared libs
# only; apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      cuda-command-line-tools-10-0 \
      cuda-cublas-dev-10-0 \
      cuda-cufft-dev-10-0 \
      cuda-curand-dev-10-0 \
      cuda-cusolver-dev-10-0 \
      cuda-cusparse-dev-10-0 \
      curl \
      libcudnn7=7.6.2.24-1+cuda10.0 \
      libcudnn7-dev=7.6.2.24-1+cuda10.0 \
      libfreetype6-dev \
      libhdf5-serial-dev \
      libpng-dev \
      libzmq3-dev \
      lsb-core \
      pkg-config \
      software-properties-common \
      unzip \
    && find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete \
    && rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a \
    && rm -rf /var/lib/apt/lists/*

# TensorRT runtime and headers, pinned to the CUDA 10.0 builds.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
      libnvinfer-dev=5.1.5-1+cuda10.0 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8

# Add google-cloud-sdk to the source list. -fsSL makes curl fail loudly on
# HTTP errors instead of piping an error page into apt-key.
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -

# Install extras needed by most models.
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      ca-certificates \
      git \
      google-cloud-sdk \
      htop \
      wget \
      zip \
    && rm -rf /var/lib/apt/lists/*

# Install / update Python
# (building TF needs py2 even if building for Python3 as of 06-AUG-2019).
# update + install share a layer so stale package lists are never used.
RUN apt-get update && apt-get install -y --no-install-recommends \
      python \
      python3 \
      python3-dev \
      python3-pip \
      python3-setuptools \
      python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --no-cache-dir --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install --no-cache-dir absl-py
RUN pip install --no-cache-dir --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install --no-cache-dir tfds-nightly
RUN pip install --no-cache-dir -U scikit-learn
RUN curl -fsSL https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
# Log the resolved package set so version skew can be debugged from build logs.
RUN pip3 freeze

# Install bazel (version pinned for reproducible TF source builds).
ARG BAZEL_VERSION=0.24.1
RUN mkdir /bazel && \
    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
    chmod +x /bazel/installer.sh && \
    /bazel/installer.sh && \
    rm -f /bazel/installer.sh

RUN git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu (this is TF 2.0)
# - Installs requirements.txt for tensorflow/models
# Additionally also installs:
# - Latest S4TF development snapshot for cuda 10.0
FROM nvidia/cuda:10.0-base-ubuntu18.04 AS base

ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ARG swift_tf_url=https://storage.googleapis.com/swift-tensorflow-artifacts/nightlies/latest/swift-tensorflow-DEVELOPMENT-cuda10.0-cudnn7-ubuntu18.04.tar.gz

# setup.py passes the base path of the local .whl file chosen for the docker
# image; otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}

# Pick up some TF dependencies.
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which
# may not really be needed. apt lists removed in-layer to keep image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      cuda-command-line-tools-10-0 \
      cuda-cublas-10-0 \
      cuda-cublas-dev-10-0 \
      cuda-cufft-10-0 \
      cuda-curand-10-0 \
      cuda-cusolver-10-0 \
      cuda-cusparse-10-0 \
      curl \
      libcudnn7=7.6.2.24-1+cuda10.0 \
      libcudnn7-dev=7.6.2.24-1+cuda10.0 \
      libfreetype6-dev \
      libhdf5-serial-dev \
      libpng-dev \
      libzmq3-dev \
      lsb-core \
      pkg-config \
      software-properties-common \
      unzip \
    && rm -rf /var/lib/apt/lists/*

# TensorRT runtime and headers, pinned to the CUDA 10.0 builds.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
      libnvinfer-dev=5.1.5-1+cuda10.0 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8

# Add google-cloud-sdk to the source list; curl -fsSL fails loudly on HTTP
# errors instead of piping an error page into apt-key.
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -

# Install extras needed by most models.
RUN apt-get update && apt-get install -y --no-install-recommends \
      ca-certificates \
      git \
      google-cloud-sdk \
      htop \
      wget \
      zip \
    && rm -rf /var/lib/apt/lists/*

# Install / update Python and Python3.
RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 \
      python3-dev \
      python3-pip \
      python3-setuptools \
      python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --no-cache-dir --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install --no-cache-dir absl-py
RUN pip install --no-cache-dir --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install --no-cache-dir tfds-nightly
RUN pip install --no-cache-dir -U scikit-learn
RUN curl -fsSL https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt
# Log the resolved package set for debugging version skew.
RUN pip freeze

### Install Swift deps.
# ARG (not ENV) so apt prompts are silenced during the build without
# leaking DEBIAN_FRONTEND into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      ca-certificates \
      clang \
      curl \
      git \
      libblocksruntime-dev \
      libcurl4-openssl-dev \
      libicu-dev \
      libncurses5-dev \
      libpython-dev \
      libpython3-dev \
      libxml2 \
      python \
      python-dev \
      python-pip \
      python-setuptools \
      python-tk \
      python3 \
      python3-pip \
      python3-setuptools \
    && rm -rf /var/lib/apt/lists/*

# Download and extract S4TF.
WORKDIR /swift-tensorflow-toolchain
RUN curl -fSsL $swift_tf_url -o swift.tar.gz \
    && mkdir usr \
    && tar -xzf swift.tar.gz --directory=usr --strip-components=1 \
    && rm swift.tar.gz
ENV PATH="/swift-tensorflow-toolchain/usr/bin:${PATH}"
ENV LD_LIBRARY_PATH="/swift-tensorflow-toolchain/usr/lib/swift/linux/:${LD_LIBRARY_PATH}"
# Ubuntu 18.04 Python3 with CUDA 10.1 and the following:
# - Installs tf-nightly-gpu (this is TF 2.1)
# - Installs requirements.txt for tensorflow/models
# - TF 2.0 tested with cuda 10.0, but we need to test tf 2.1 with cuda 10.1.
# Additionally also installs
# - Latest S4TF development snapshot for cuda 10.1
FROM nvidia/cuda:10.1-base-ubuntu18.04 AS base

ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ARG swift_tf_url=https://storage.googleapis.com/swift-tensorflow-artifacts/nightlies/latest/swift-tensorflow-DEVELOPMENT-cuda10.1-cudnn7-ubuntu18.04.tar.gz

# setup.py passes the base path of the local .whl file chosen for the docker
# image; otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}

# Pick up some TF dependencies.
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which
# may not really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g to 10.2 or something later):
#   sudo apt-cache search cuda-command-line-tool
#   sudo apt-cache search cuda-cublas
#   sudo apt-cache search cuda-cufft
#   sudo apt-cache search cuda-curand
#   sudo apt-cache search cuda-cusolver
#   sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      cuda-command-line-tools-10-1 \
      cuda-cufft-10-1 \
      cuda-curand-10-1 \
      cuda-cusolver-10-1 \
      cuda-cusparse-10-1 \
      curl \
      libcudnn7=7.6.4.38-1+cuda10.1 \
      libcudnn7-dev=7.6.4.38-1+cuda10.1 \
      libfreetype6-dev \
      libhdf5-serial-dev \
      libpng-dev \
      libzmq3-dev \
      lsb-core \
      pkg-config \
      software-properties-common \
      unzip \
    && rm -rf /var/lib/apt/lists/*

# TensorRT runtime and headers, pinned to the CUDA 10.1 builds.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.1 \
      libnvinfer-dev=5.1.5-1+cuda10.1 \
      libnvinfer6=6.0.1-1+cuda10.1 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8

# Add google-cloud-sdk to the source list; curl -fsSL fails loudly on HTTP
# errors instead of piping an error page into apt-key.
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -

# Install extras needed by most models.
RUN apt-get update && apt-get install -y --no-install-recommends \
      ca-certificates \
      git \
      google-cloud-sdk \
      htop \
      wget \
      zip \
    && rm -rf /var/lib/apt/lists/*

# Install / update Python and Python3.
RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 \
      python3-dev \
      python3-pip \
      python3-setuptools \
      python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --no-cache-dir --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install --no-cache-dir absl-py
RUN pip install --no-cache-dir --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install --no-cache-dir tfds-nightly
RUN pip install --no-cache-dir -U scikit-learn
RUN curl -fsSL https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt
# Log the resolved package set for debugging version skew.
RUN pip freeze

### Install Swift deps.
# ARG (not ENV) so apt prompts are silenced during the build without
# leaking DEBIAN_FRONTEND into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      ca-certificates \
      clang \
      curl \
      git \
      libblocksruntime-dev \
      libcurl4-openssl-dev \
      libicu-dev \
      libncurses5-dev \
      libpython-dev \
      libpython3-dev \
      libxml2 \
      python \
      python-dev \
      python-pip \
      python-setuptools \
      python-tk \
      python3 \
      python3-pip \
      python3-setuptools \
    && rm -rf /var/lib/apt/lists/*

# Download and extract S4TF.
WORKDIR /swift-tensorflow-toolchain
RUN curl -fSsL $swift_tf_url -o swift.tar.gz \
    && mkdir usr \
    && tar -xzf swift.tar.gz --directory=usr --strip-components=1 \
    && rm swift.tar.gz
ENV PATH="/swift-tensorflow-toolchain/usr/bin:${PATH}"
ENV LD_LIBRARY_PATH="/swift-tensorflow-toolchain/usr/lib/swift/linux/:${LD_LIBRARY_PATH}"
# Ubuntu 18.04 Python3 with CUDA 10.1 and the following:
# - Installs tf-nightly-gpu (this is TF 2.1)
# - Installs requirements.txt for tensorflow/models
# - TF 2.0 tested with cuda 10.0, but we need to test tf 2.1 with cuda 10.1.
FROM nvidia/cuda:10.1-base-ubuntu18.04 AS base

ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""

# setup.py passes the base path of the local .whl file chosen for the docker
# image; otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}

# Pick up some TF dependencies.
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which
# may not really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g to 10.2 or something later):
#   sudo apt-cache search cuda-command-line-tool
#   sudo apt-cache search cuda-cublas
#   sudo apt-cache search cuda-cufft
#   sudo apt-cache search cuda-curand
#   sudo apt-cache search cuda-cusolver
#   sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      cuda-command-line-tools-10-1 \
      cuda-cufft-10-1 \
      cuda-curand-10-1 \
      cuda-cusolver-10-1 \
      cuda-cusparse-10-1 \
      curl \
      libcudnn7=7.6.4.38-1+cuda10.1 \
      libcudnn7-dev=7.6.4.38-1+cuda10.1 \
      libfreetype6-dev \
      libhdf5-serial-dev \
      libpng-dev \
      libzmq3-dev \
      lsb-core \
      pkg-config \
      software-properties-common \
      unzip \
    && rm -rf /var/lib/apt/lists/*

# TensorRT runtime and headers, pinned to the CUDA 10.1 builds.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.1 \
      libnvinfer-dev=5.1.5-1+cuda10.1 \
      libnvinfer6=6.0.1-1+cuda10.1 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8

# Add google-cloud-sdk to the source list; curl -fsSL fails loudly on HTTP
# errors instead of piping an error page into apt-key.
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -

# Install extras needed by most models.
RUN apt-get update && apt-get install -y --no-install-recommends \
      ca-certificates \
      git \
      google-cloud-sdk \
      htop \
      wget \
      zip \
    && rm -rf /var/lib/apt/lists/*

# Install / update Python and Python3.
RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 \
      python3-dev \
      python3-pip \
      python3-setuptools \
      python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --no-cache-dir --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install --no-cache-dir absl-py
RUN pip install --no-cache-dir --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install --no-cache-dir tfds-nightly
RUN pip install --no-cache-dir -U scikit-learn
RUN curl -fsSL https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt
# Log the resolved package set for debugging version skew.
RUN pip freeze
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute benchmark."""
from __future__ import print_function
import argparse
import json
import logging
import multiprocessing
import os
import re
import sys
import time
import perfzero.benchmark_method_runner as benchmark_method_runner
import perfzero.perfzero_config as perfzero_config
import perfzero.utils as utils
class BenchmarkRunner(object):
  """Execute benchmark and report results.

  Orchestrates the full benchmark lifecycle: environment setup (gcloud
  activation, data download, git checkout, optional TPU startup), benchmark
  method discovery, and per-method execution in isolated subprocesses.
  """

  def __init__(self, config):
    # config: a perfzero_config.PerfZeroConfig instance.
    self.config = config
    # Root of the perfzero checkout (parent of this file's directory).
    self.project_dir = os.path.abspath(
        os.path.dirname(os.path.dirname(__file__)))
    # All mutable state (downloads, checkouts, output) lives under workspace.
    self.workspace_dir = os.path.join(self.project_dir, config.workspace)
    self.site_packages_dir = os.path.join(self.workspace_dir, 'site-packages')
    self.root_output_dir = os.path.join(self.workspace_dir, 'output')
    # Maps operation/benchmark name -> elapsed seconds; printed at the end.
    self.benchmark_execution_time = {}

  def _setup(self):
    """Download data and checkout git repository.

    Returns:
      Description of the checked-out site packages as produced by
      utils.checkout_git_repos(); attached to the execution summary later.
    """
    # Activate gcloud service.
    start_time = time.time()
    utils.setup_python_path(self.site_packages_dir, self.config.python_path_str)
    utils.active_gcloud_service(self.config.gcloud_key_file_url,
                                self.workspace_dir)
    utils.make_dir_if_not_exist(self.root_output_dir)
    self.benchmark_execution_time['activate_gcloud_service'] = (
        time.time() - start_time)

    # Download data from both the gcs-specific and the generic download specs.
    start_time = time.time()
    utils.download_data(utils.parse_data_downloads_str(
        self.config.root_data_dir, self.config.gcs_downloads_str))
    utils.download_data(utils.parse_data_downloads_str(
        self.config.root_data_dir, self.config.data_downloads_str))
    self.benchmark_execution_time['download_data'] = time.time() - start_time

    # Checkout git repositories.
    start_time = time.time()
    site_package_info = utils.checkout_git_repos(
        self.config.get_git_repos(self.site_packages_dir),
        self.config.use_cached_site_packages)
    self.benchmark_execution_time['checkout_repository'] = (
        time.time() - start_time)

    # Start cloud TPU (only when TPU parameters were configured).
    if self.config.tpu_parameters is not None:
      start_time = time.time()
      utils.setup_tpu(self.config.tpu_parameters)
      self.benchmark_execution_time['start_tpu'] = time.time() - start_time

    # Mirror log records to stdout in addition to any existing handlers.
    self.stream_handler = logging.StreamHandler(sys.stdout)
    self.stream_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(self.stream_handler)
    return site_package_info

  def _get_benchmark_methods(self):
    """Returns list of benchmark methods to execute."""
    filter_prefix = 'filter:'
    benchmark_methods = []
    for benchmark_method_pattern in self.config.benchmark_method_patterns:
      if filter_prefix not in benchmark_method_pattern:
        # Pattern is already a fully-qualified method path; use it verbatim.
        benchmark_methods.append(benchmark_method_pattern)
      else:
        # Pattern has the form '<class_path>.filter:<regex>': expand it by
        # matching the regex against the attribute names of the class.
        index = benchmark_method_pattern.find(filter_prefix)
        benchmark_class = benchmark_method_pattern[:index - 1]
        pattern = benchmark_method_pattern[index + len(filter_prefix):]
        # Instantiated only for introspection; output goes to /dev/null.
        class_instance = utils.instantiate_benchmark_class(benchmark_class,
                                                           '/dev/null',
                                                           '',
                                                           None)
        for benchmark_method_name in dir(class_instance):
          if re.match(pattern, benchmark_method_name):
            benchmark_methods.append(benchmark_class + '.' +
                                     benchmark_method_name)
    logging.info('The following benchmark methods will be executed: %s',
                 benchmark_methods)
    return benchmark_methods

  def run_benchmark(self):
    """Run benchmark."""
    harness_info = utils.get_git_repo_info(self.project_dir)
    site_package_info = self._setup()
    has_exception = False
    benchmark_success_results = {}
    benchmark_output_dirs = {}
    try:
      for benchmark_method in self._get_benchmark_methods():
        # Run the benchmark method in a separate process so that its memory
        # usage will not affect the execution of other benchmark methods.
        # This is a workaround before we fix all memory leak issues in
        # TensorFlow.
        queue = multiprocessing.Queue()
        process = multiprocessing.Process(target=benchmark_method_runner.run,
                                          args=(benchmark_method,
                                                harness_info,
                                                site_package_info,
                                                self.root_output_dir,
                                                self.config, queue))
        process.start()
        process.join()
        # The child always puts exactly one result tuple on the queue, even
        # on failure (see benchmark_method_runner.run).
        method_has_exception, method_execution_time, succeeded, output_dir = queue.get()  # pylint: disable=line-too-long
        has_exception |= method_has_exception
        self.benchmark_execution_time[benchmark_method] = method_execution_time
        benchmark_success_results[benchmark_method] = succeeded
        benchmark_output_dirs[benchmark_method] = output_dir
    finally:
      # Tear down the TPU and report partial results even if a method raised.
      if self.config.tpu_parameters is not None:
        has_exception |= utils.cleanup_tpu(self.config.tpu_parameters)
      print('Benchmark execution time in seconds by operation:\n {}'.format(
          json.dumps(self.benchmark_execution_time, indent=2)))
      print('Benchmark success results:\n{}'.format(
          json.dumps(benchmark_success_results, indent=2)))
      print('Benchmark local output directories:\n{}'.format(
          json.dumps(benchmark_output_dirs, indent=2)))
    # Non-zero exit so CI can detect any method failure.
    if has_exception:
      sys.exit(1)
if __name__ == '__main__':
  # Build the CLI parser; default values are shown in --help output.
  arg_parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  perfzero_config.add_benchmark_parser_arguments(arg_parser)
  FLAGS, unparsed = arg_parser.parse_known_args()

  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                      level=logging.DEBUG if FLAGS.debug else logging.INFO)

  # Any flag the parser did not recognize is a fatal usage error.
  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  runner = BenchmarkRunner(
      perfzero_config.PerfZeroConfig(mode='flags', flags=FLAGS))
  runner.run_benchmark()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute a single benchmark method."""
from __future__ import print_function
import datetime
import json
import logging
import os
import time
import traceback
from perfzero.process_info_tracker import ProcessInfoTracker
import perfzero.report_utils as report_utils
from perfzero.tensorflow_profiler import TensorFlowProfiler
import perfzero.utils as utils
def run(benchmark_method, harness_info, site_package_info,
        root_output_dir, config, queue):
  """Runs a single benchmark method, reporting failure through the queue.

  Thin error boundary around _run_internal(): if it raises for any reason,
  the stack trace is logged and a failure tuple (has_exception=True, no
  timing info, succeeded=False, no output dir) is put on the queue, so the
  parent process' queue.get() always receives exactly one result.
  """
  try:
    _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue)
  except Exception:  # pylint: disable=broad-except
    logging.error('Benchmark execution for %s failed due to error:\n %s',
                  benchmark_method, traceback.format_exc())
    queue.put((True, None, False, None))
def _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue):
  """Run benchmark method and put result to the queue.

  Args:
    benchmark_method: Canonical path to the benchmark method
    harness_info: Description of the benchmark harness used in the benchmark
    site_package_info: Description of the site-package used in the benchmark
    root_output_dir: Directory under which to put the benchmark output
    config: An instance of perfzero_config
    queue: An interprocess queue to transfer benchmark result to the caller
  """
  start_timestamp = time.time()
  execution_timestamp = start_timestamp
  method_has_exception = False
  # Each execution gets a unique id (caller-supplied or timestamp-derived)
  # used as the per-run output directory name.
  execution_id = (config.execution_id if config.execution_id else
                  datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
  output_dir = os.path.join(root_output_dir, execution_id)
  # Model output may be redirected to a GCS scratch location while logs and
  # profiler data stay in the local output_dir.
  if config.scratch_gcs_url:
    model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
  else:
    model_output_dir = output_dir
  utils.make_dir_if_not_exist(output_dir)
  # 'pkg.module.Class.method' -> ('pkg.module.Class', 'method').
  benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
  benchmark_class_name = benchmark_class.rsplit('.', 1)[1]
  tensorflow_profiler = TensorFlowProfiler(
      config.profiler_enabled_time_str, output_dir)
  process_info_tracker = ProcessInfoTracker(output_dir)
  process_info = None

  # Setup per-method file logger
  filehandler = logging.FileHandler(
      filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
  filehandler.setFormatter(
      logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
  logging.getLogger().addHandler(filehandler)

  try:
    if config.tpu_parameters:
      tpu = config.tpu_parameters.get('name')
    else:
      tpu = None
    class_instance = utils.instantiate_benchmark_class(
        benchmark_class=benchmark_class,
        output_dir=model_output_dir,
        root_data_dir=config.root_data_dir,
        tpu=tpu)
    # tf.test.Benchmark.report_benchmark() writes results to a file with
    # path benchmark_result_file_path_prefix + benchmark_method
    benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
    os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
    benchmark_result_file_path = '{}{}.{}'.format(
        benchmark_result_file_path_prefix,
        benchmark_class_name,
        benchmark_method_name)

    # Start background threads for profiler and system info tracker
    tensorflow_profiler.start()
    process_info_tracker.start()

    # Run benchmark method
    execution_timestamp = time.time()
    logging.info('Starting benchmark execution: %s', benchmark_method)
    getattr(class_instance, benchmark_method_name)()
    logging.info('Stopped benchmark: %s', benchmark_method)

    # Read and build benchmark results
    raw_benchmark_result = utils.read_benchmark_result(
        benchmark_result_file_path)
    # Explicitly overwrite the name to be the full path to benchmark method
    raw_benchmark_result['name'] = benchmark_method
  except Exception:  # pylint: disable=broad-except
    # On failure, synthesize a placeholder result (wall_time=-1) so the
    # reporting path below still produces an execution summary.
    logging.error('Benchmark execution for %s failed due to error:\n %s',
                  benchmark_method, traceback.format_exc())
    method_has_exception = True
    raw_benchmark_result = {}
    raw_benchmark_result['name'] = benchmark_method
    raw_benchmark_result['wall_time'] = -1
    raw_benchmark_result['extras'] = {}
  finally:
    # Stop background threads for profiler and system info tracker
    process_info = process_info_tracker.stop()
    tensorflow_profiler.stop()

  upload_timestamp = time.time()
  benchmark_result = report_utils.build_benchmark_result(
      raw_benchmark_result, method_has_exception)
  execution_summary = report_utils.build_execution_summary(
      execution_timestamp,
      execution_id,
      config.ml_framework_build_label,
      config.execution_label,
      config.platform_name,
      config.system_name,
      config.output_gcs_url,
      benchmark_result,
      config.get_env_vars(),
      config.get_flags(),
      harness_info,
      site_package_info,
      process_info,
      method_has_exception)
  report_utils.upload_execution_summary(
      config.bigquery_project_name,
      config.bigquery_dataset_table_name,
      execution_summary)
  report_utils.execute_methods(
      config.result_upload_methods,
      execution_summary)
  logging.info('Benchmark execution for %s completed with summary:\n %s',
               benchmark_method, json.dumps(execution_summary, indent=2))
  utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
  # Detach the per-method file handler so later methods don't log here.
  logging.getLogger().removeHandler(filehandler)

  # Phase timings reported back to the parent process.
  method_execution_time = {
      'class_initialization': execution_timestamp - start_timestamp,
      'method_execution': upload_timestamp - execution_timestamp,
      'log_upload': time.time() - upload_timestamp
  }

  if config.profiler_enabled_time_str:
    relative_output_dir = output_dir[output_dir.find('benchmark'):]
    print('\nExecute the command below to start tensorboard server using '
          'the collected profiler data:\ntensorboard --logdir={}\n\n'
          'Open localhost:6006 in your browser to access the Tensorbord '
          'GUI. Use ssh with port forwarding if tensorboard is running on '
          'a remote machine.\n'.format(relative_output_dir))

  queue.put((method_has_exception, method_execution_time,
             benchmark_result['succeeded'], output_dir))
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero configs provided by user."""
from __future__ import print_function
import json
import logging
import os
def add_setup_parser_arguments(parser):
  """Add arguments to the parser used by the setup.py.

  Args:
    parser: an argparse.ArgumentParser instance. It is mutated in place by
      adding the setup-time flags below.
  """
  parser.add_argument(
      '--dockerfile_path',
      default='docker/Dockerfile_ubuntu_1804_tf_v1',
      type=str,
      help='''Build the docker image using docker file located at the ${pwd}/${dockerfile_path} if
      it exists, where ${pwd} is user's current work directory. Otherwise, build
      the docker image using the docker file located at path_to_perfzero/${dockerfile_path}.
      ''')
  parser.add_argument(
      '--workspace',
      default='workspace',
      type=str,
      help='''The gcloud key file will be downloaded under directory path_to_perfzero/${workspace}
      ''')
  parser.add_argument(
      '--gcloud_key_file_url',
      default='',
      type=str,
      help='''DEPRECATED: Use --gcloud_key_file_url of setup.py instead.
      The gcloud key file url. When specified, it will be downloaded to the
      directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
      ''')
  parser.add_argument(
      '--root_data_dir',
      default='/data',
      type=str,
      # Fixed typo: "becnhmark" -> "benchmark".
      help='The directory which should contain the dataset required by the benchmark method.'
  )
  parser.add_argument(
      '--gce_nvme_raid',
      default=None,
      type=str,
      help='If set to non-empty string, create raid 0 array with devices at the directory specified by the flag --root_data_dir'
  )
  parser.add_argument(
      '--tensorflow_pip_spec',
      default=None,
      type=str,
      # Fixed typo: "specfication" -> "specification".
      help='''The tensorflow pip package specification. The format can be either ${package_name}, or ${package_name}==${package_version}.
      Example values include tf-nightly-gpu, and tensorflow==1.12.0. If it is specified, the corresponding tensorflow pip package/version
      will be installed. Otherwise, the default tensorflow pip package specified in the docker file will be installed.
      ''')
  parser.add_argument(
      '--extra_pip_specs',
      default='',
      type=str,
      help='''Additional specifications to pass to `pip install`. (e.g. pinning certain dependencies)
      Specifications should be semicolon separated: e.g. `numpy==1.16.4;scipy==1.3.1`
      ''')
  parser.add_argument(
      '--docker_tag',
      default='perfzero/tensorflow',
      type=str,
      help='The docker tag to use if building a docker image.'
  )
  parser.add_argument(
      '--site_package_downloads',
      default='',
      type=str,
      help='''Comma separated list of dirs in the external vm to copy to the docker\'s site package dir.
      Format: <absolute-path>/src/dir:new_base_dir_name,<absolute-path>/src/dir2>:new_name,....
      This will copy <absolute-path>/src/dir to <site-packages>/new_base_dir_name.
      '''
  )
def add_benchmark_parser_arguments(parser):
  """Add arguments to the parser used by the benchmark.py.

  Args:
    parser: an argparse.ArgumentParser instance. It is mutated in place by
      adding the benchmark-time flags below.
  """
  parser.add_argument(
      '--use_cached_site_packages',
      action='store_true',
      help='If set, skip git pull for dependent git repositories if it already exists in path_to_perfzero/${workspace}/site-packages'
  )
  parser.add_argument(
      '--gcs_downloads',
      default=None,
      type=str,
      help='This flag is deprecated. Use the flag --data_downloads instead')
  parser.add_argument(
      '--git_repos',
      default=None,
      type=str,
      help='''A string representing git repositories to checkout. The format is url_1;branch_1;hash_1,url_2;branch_2;hash_2,...
      Git repositories will be checked-out under directory path_to_perfzero/${workspace}/site-packages,
      where ${workspace} either defaults to 'workspace', or takes the value of the flag --workspace.
      branch and hash can be skipped if user wants to use the head of the master branch,
      which shortens the format to url_1,url_2,...
      ''')
  parser.add_argument(
      '--benchmark_methods',
      action='append',
      default=[],
      type=str,
      help='''This string specifies the benchmark_method to be executed. The flag can be specified multiple times in which case
      the union of methods matched by these flags will be executed. The format can be module_path.class_name.method_name in which
      case the corresponding method is executed. The format can also be module_path.class_name.filter:regex_pattern, in which case all methods
      of the given class whose method name matches the given regular expression are executed.
      ''')
  parser.add_argument(
      '--ml_framework_build_label',
      default=None,
      type=str,
      # Fixed grammar: "identified" -> "identifies".
      help='A string that identifies the machine learning framework build, e.g. nightly-gpu-build'
  )
  parser.add_argument(
      '--execution_label',
      default=None,
      type=str,
      help='A string that identifies the benchmark execution type, e.g. test, prod'
  )
  parser.add_argument(
      '--platform_name',
      default=None,
      type=str,
      help='A string that identifies the computing platform, e.g. gcp, aws'
  )
  parser.add_argument(
      '--system_name',
      default=None,
      type=str,
      help='A string that identifies the hardware system, e.g. n1-standard-64-8xV100'
  )
  parser.add_argument(
      '--output_gcs_url',
      default=None,
      type=str,
      help='''If specified, log files generated by the benchmark execution will be uploaded to output_gcs_url/${execution_id},
      where ${execution_id} is a string that generated by PerfZero which uniquely identifies the execution of one benchmark method
      ''')
  parser.add_argument(
      '--scratch_gcs_url',
      default=None,
      type=str,
      help='''If specified, intermediate files like model outputs will be stored in scratch_gcs_url/${execution_id}, where
      ${execution_id} is a string that is generated by PerfZero which uniquely identifies the execution of one benchmark method.
      If not specified, intermediate files will be stored in a local folder on the host.
      ''')
  parser.add_argument(
      '--bigquery_project_name',
      default=None,
      type=str,
      help='''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
      summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
      The value of each field can in turn be a json-formatted string. See README.md for example output.
      ''')
  parser.add_argument(
      '--bigquery_dataset_table_name',
      default=None,
      type=str,
      help='''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
      summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
      The value of each field can in turn be a json-formatted string. See README.md for example output.
      ''')
  parser.add_argument(
      '--python_path',
      default=None,
      type=str,
      # Fixed typo: "libraies" -> "libraries".
      help='''A string of format path_1,path_2,... For each ${path} specified in the string,
      path_to_perfzero/${workspace}/site-packages/${path} will be added to python path so that libraries downloaded by --git_repos can
      be loaded and executed.
      ''')
  parser.add_argument(
      '--workspace',
      default='workspace',
      type=str,
      help='''The log files, gcloud key file and git repositories will be generated and downloaded under the
      directory path_to_perfzero/${workspace}
      ''')
  parser.add_argument(
      '--root_data_dir',
      default='/data',
      type=str,
      # Fixed typo: "becnhmark" -> "benchmark".
      help='The directory which should contain the dataset required by the benchmark method.'
  )
  parser.add_argument(
      '--data_downloads',
      default=None,
      type=str,
      help='''A string of format url_1;relative_path_1,url_2;relative_path_2,...
      Data will be copied from ${url} to ${root_data_dir}/${relative_path}. ${relative_path} can be skipped if it is the same as the
      base name of the url, which shortens the format to url_1,url_2,... ${root_data_dir} is specified by the flag --root_data_dir.
      File will be de-compressed in ${root_data_dir} if its name ends with 'gz'. Only url prefixed with gcs, http or https are supported.
      Each url can start with file://, gcs://, http:// or https://.
      ''')
  parser.add_argument(
      '--gcloud_key_file_url',
      default='gs://tf-performance/auth_tokens/benchmark_upload_gce.json',
      type=str,
      help='''The gcloud key file url. When specified, it will be downloaded to the
      directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
      The key file will then be activated and used as gcloud authentication credential.
      ''')
  parser.add_argument(
      '--debug',
      action='store_true',
      help='If set, use debug level logging. Otherwise, use info level logging'
  )
  parser.add_argument(
      '--profiler_enabled_time',
      default=None,
      type=str,
      help='''A string of format begin_time_1:end_time_1,begin_time_2:end_time_2,.... PerfZero will start to collect profiler
      data ${begin_time} sec after benchmark method execution starts. The data collection continues for ${end_time - begin_time}
      sec or until the benchmark method execution finishes, whichever occurs first. If ${end_time} is not explicitly
      specified, it is assumed to be MAX_LONG.
      ''')
  parser.add_argument(
      '--execution_id',
      default=None,
      type=str,
      help='A string that uniquely identifies the benchmark execution.')
  parser.add_argument(
      '--result_upload_methods',
      default=None,
      type=str,
      help='A comma separated list of class.method values to upload results.')
  parser.add_argument(
      '--tpu_parameters',
      default=None,
      type=str,
      help='''A json dictionary of cloud tpu parameters. The format must look like the following:
      {"name": "my-tpu-name", project": "my-gcp-project-id", "zone": "europe-west4-a", "size": "v3-8", "version": "nightly-2.x"}
      ''')
class PerfZeroConfig(object):
  """Creates and contains config for PerfZero."""

  def __init__(self, mode, flags=None):
    """Builds the config from parsed command-line flags.

    Args:
      mode: a string; only 'flags' triggers attribute extraction.
      flags: an argparse namespace produced by add_benchmark_parser_arguments.

    Raises:
      ValueError: if only one of the two bigquery flags is specified.
    """
    self.mode = mode
    self.flags = flags
    if mode == 'flags':
      self.gcs_downloads_str = flags.gcs_downloads
      self.data_downloads_str = flags.data_downloads
      self.git_repos_str = flags.git_repos
      # Each --benchmark_methods value may itself be a comma-separated list.
      self.benchmark_method_patterns = [
          pattern
          for value in flags.benchmark_methods
          for pattern in value.split(',')
      ]
      self.ml_framework_build_label = flags.ml_framework_build_label
      self.execution_label = flags.execution_label
      self.platform_name = flags.platform_name
      self.system_name = flags.system_name
      self.output_gcs_url = flags.output_gcs_url
      self.scratch_gcs_url = flags.scratch_gcs_url
      self.bigquery_project_name = flags.bigquery_project_name
      self.bigquery_dataset_table_name = flags.bigquery_dataset_table_name
      self.python_path_str = flags.python_path
      self.workspace = flags.workspace
      self.use_cached_site_packages = flags.use_cached_site_packages
      self.root_data_dir = flags.root_data_dir
      self.gcloud_key_file_url = flags.gcloud_key_file_url
      self.profiler_enabled_time_str = flags.profiler_enabled_time
      self.execution_id = flags.execution_id
      self.result_upload_methods = flags.result_upload_methods
      self.tpu_parameters = (json.loads(flags.tpu_parameters)
                             if flags.tpu_parameters else None)

      if not flags.benchmark_methods:
        logging.warning('No benchmark method is specified by '
                        '--benchmark_methods')
      # The two bigquery flags must be given together or not at all.
      if flags.bigquery_project_name and not flags.bigquery_dataset_table_name:
        raise ValueError('--bigquery_project_name is specified but '
                         '--bigquery_dataset_table_name is not')
      if not flags.bigquery_project_name and flags.bigquery_dataset_table_name:
        raise ValueError('--bigquery_dataset_table_name is specified but '
                         '--bigquery_project_name is not')

  def get_env_vars(self):
    """Returns the environment variables whose name starts with PERFZERO_."""
    return {name: value for name, value in os.environ.items()
            if name.startswith('PERFZERO_')}

  def get_flags(self):
    """Returns a dict of the parsed flags whose value is not None."""
    return {name: value for name, value in vars(self.flags).items()
            if value is not None}

  def get_git_repos(self, site_packages_dir):
    """Parse git repository string."""
    if not self.git_repos_str:
      return []

    repos = []
    for entry in self.git_repos_str.split(','):
      fields = entry.split(';')
      url = fields[0]
      # Assume the git url has format */{dir_name}.git
      dir_name = url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
      repo = {
          'url': url,
          'dir_name': dir_name,
          'local_path': os.path.join(site_packages_dir, dir_name),
      }
      if len(fields) >= 2:
        repo['branch'] = fields[1]
      if len(fields) >= 3:
        repo['git_hash'] = fields[2]
      repos.append(repo)
    return repos
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Upload test results."""
from __future__ import print_function
import importlib
import json
import logging
import perfzero.utils as utils
import psutil
import socket
from six import u as unicode # pylint: disable=W0622
def execute_methods(method_names_str, *args, **kwargs):
  """Calls a list of method names on given function params.

  Args:
    method_names_str: String - Comma-separated module.foo.bar.method strings.
      This function imports module.foo.bar for each such method and calls it
      with *args and **kwargs.
    *args: Function params common to each method.
    **kwargs: Function params common to each method.

  Raises:
    RuntimeError: If any of the invoked methods raised an exception.
  """
  if not method_names_str:
    return

  failures = []
  for qualified_name in method_names_str.split(','):
    try:
      logging.info('Trying to call %s', qualified_name)
      module_path, method_path = qualified_name.rsplit('.', 1)
      target_module = importlib.import_module(module_path)
      logging.info('Found module %s, looking for %s', module_path, method_path)
      target_method = getattr(target_module, method_path)
      logging.info('Found method %s', method_path)
      target_method(*args, **kwargs)
    except Exception as e:  # pylint: disable=broad-except
      # Collect the failure and keep invoking the remaining methods.
      failures.append(str(e))

  if failures:
    raise RuntimeError('\n' + '\n'.join(failures))
def upload_execution_summary(bigquery_project_name, bigquery_dataset_table_name,
                             execution_summary):
  """Upload benchmark summary.

  Note: Using stream=False has a 1000 per day insert limit per table. Using
  stream=True, the documented limit is 50K+. With streaming there can be
  a small and possibly not noticeable delay to seeing the results the BigQuery
  UI, but there can be a 90 minute more or less delay in the results being part
  of exports.

  Note: BigQuery maps unicode() to STRING for python2. If str is used that is
  mapped to BYTE.

  Args:
    bigquery_project_name: Name of the gcp project.
    bigquery_dataset_table_name: data_set and table name, as 'dataset.table'.
    execution_summary: benchmark summary dictionary of results.
  """
  # Validate both parameters BEFORE importing the google client libraries so
  # that a skipped upload does not require them to be installed. Each message
  # now names the parameter that is actually missing (they were swapped).
  if not bigquery_project_name:
    logging.info(
        'Skipped uploading benchmark result to bigquery because bigquery project name is not set.'
    )
    return
  if not bigquery_dataset_table_name:
    logging.info(
        'Skipped uploading benchmark result to bigquery because bigquery table name is not set.'
    )
    return

  # pylint: disable=C6204
  import google.auth
  from google.cloud import bigquery

  credentials = google.auth.default()[0]
  dataset_name = bigquery_dataset_table_name.split('.')[0]
  table_name = bigquery_dataset_table_name.split('.')[1]
  client = bigquery.Client(
      project=bigquery_project_name, credentials=credentials)

  # BigQuery expects STRING values; nested dicts are serialized to json.
  benchmark_summary_input = {}
  for key, value in execution_summary.items():
    if isinstance(value, dict):
      benchmark_summary_input[key] = unicode(json.dumps(value))
    else:
      benchmark_summary_input[key] = unicode(value)
  logging.debug('Bigquery input for benchmark_summary table is %s',
                json.dumps(benchmark_summary_input, indent=2))

  errors = []
  # TODO(tobyboyd): Shim to direct results to new table until all jobs
  # are updated.
  if 'benchmark_results' in dataset_name:
    if dataset_name == 'benchmark_results_dev':
      table_ref = client.dataset('perfzero_dev').table('benchmark_summary')
      table_obj = client.get_table(table_ref)
    elif dataset_name == 'benchmark_results':
      table_ref = client.dataset('perfzero').table('benchmark_summary')
      table_obj = client.get_table(table_ref)
  else:
    table_ref = client.dataset(dataset_name).table(table_name)
    table_obj = client.get_table(table_ref)
  errors.extend(client.insert_rows(table_obj, [benchmark_summary_input]))

  if errors:
    logging.error(
        'Failed to upload benchmark result to bigquery due to errors %s',
        errors)
  else:
    logging.info(
        'Uploaded benchmark result to the table %s of the bigquery project %s.',
        bigquery_dataset_table_name,
        bigquery_project_name)
def build_benchmark_result(raw_benchmark_result, has_exception):
  """Converts test_log.proto format to PerfZero format."""
  # Start from the exception flag; metric bound violations may flip it below.
  succeeded = not has_exception

  extras = []
  raw_extras = raw_benchmark_result.get('extras', {})
  for extra_name in raw_extras:
    detail = raw_extras[extra_name]
    # Prefer the numeric value; otherwise fall back to the string value.
    if 'double_value' in detail:
      extra_value = detail['double_value']
    else:
      extra_value = detail['string_value']
    extras.append({'name': extra_name, 'value': extra_value})

  metrics = []
  for metric in raw_benchmark_result.get('metrics', []):
    observed = metric['value']
    below_min = 'min_value' in metric and metric['min_value'] > observed
    above_max = 'max_value' in metric and metric['max_value'] < observed
    if below_min or above_max:
      succeeded = False
    metrics.append(metric)

  return {
      'name': raw_benchmark_result['name'],
      'wall_time': raw_benchmark_result['wall_time'],
      'succeeded': succeeded,
      'extras': extras,
      'metrics': metrics,
  }
def build_execution_summary(execution_timestamp, execution_id,
                            ml_framework_build_label, execution_label,
                            platform_name, system_name, output_gcs_url,
                            benchmark_result, env_vars, flags, harness_info,
                            site_package_info, process_info, has_exception):
  """Builds summary of the execution."""
  # Avoids module not found during setup phase when tf is not installed yet.
  # pylint: disable=C6204
  import tensorflow as tf

  # Benchmark harness metadata; optional fields are added only when present.
  benchmark_info = {
      'harness_name': 'perfzero',
      'harness_info': harness_info,
      'has_exception': has_exception,
  }
  if execution_label:
    benchmark_info['execution_label'] = execution_label
  if output_gcs_url:
    benchmark_info['output_url'] = '{}/{}/'.format(output_gcs_url, execution_id)
  if env_vars:
    benchmark_info['env_vars'] = env_vars
  if flags:
    benchmark_info['flags'] = flags
  benchmark_info['site_package_info'] = site_package_info

  raw_build_version = tf.__git_version__
  # tf.__git_version__ in Python3 has format b'version_string'
  if raw_build_version[0] == 'b':
    build_version = raw_build_version[2:-1]
  else:
    build_version = raw_build_version
  ml_framework_info = {
      'name': 'tensorflow',
      'version': tf.__version__,
      'build_version': build_version,
  }
  if ml_framework_build_label:
    ml_framework_info['build_label'] = ml_framework_build_label

  # Hardware/system description gathered from the host at execution time.
  system_info = {}
  if platform_name:
    system_info['platform_name'] = platform_name
  if system_name:
    system_info['system_name'] = system_name
  gpu_info = utils.get_gpu_info()
  if gpu_info:
    system_info['accelerator_driver_version'] = gpu_info['gpu_driver_version']
    system_info['accelerator_model'] = gpu_info['gpu_model']
    system_info['accelerator_count'] = gpu_info['gpu_count']
  system_info['cpu_model'] = utils.get_cpu_name()
  system_info['physical_cpu_count'] = psutil.cpu_count(logical=False)
  system_info['logical_cpu_count'] = psutil.cpu_count(logical=True)
  system_info['cpu_socket_count'] = utils.get_cpu_socket_count()
  system_info['hostname'] = socket.gethostname()

  execution_summary = {
      'execution_id': execution_id,
      'execution_timestamp': execution_timestamp,
      'benchmark_result': benchmark_result,
      'benchmark_info': benchmark_info,
      'setup_info': {},
      'ml_framework_info': ml_framework_info,
      'system_info': system_info,
  }
  if process_info:
    execution_summary['process_info'] = process_info
  return execution_summary
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Collect profiler data for Tensorboard with a separate thread."""
from __future__ import print_function
import logging
import os
import sched
import threading
import time
import traceback
import perfzero.utils as utils
def _start_profiler():
  """Starts the TensorFlow profiler, logging (not raising) on failure."""
  from tensorflow.python.eager import profiler  # pylint: disable=g-import-not-at-top
  try:
    profiler.start()
    logging.info('Started TensorFlow profiler')
  except Exception:  # pylint: disable=broad-except
    # A profiler failure must not crash the scheduler thread.
    logging.error('TensorFlow profiler failed to start due to error:\n %s',
                  traceback.format_exc())
def _stop_and_save_profiler(output_dir):
  """Stop profiler and save profiler data.

  Args:
    output_dir: log directory to place the profiler data
  """
  from tensorflow.python.eager import profiler  # pylint: disable=g-import-not-at-top
  try:
    profiler_data_dir = os.path.join(output_dir, 'profiler_data')
    logging.info('Stopping TensorFlow profiler and saving data to dir %s',
                 profiler_data_dir)
    utils.make_dir_if_not_exist(profiler_data_dir)
    trace_bytes = profiler.stop()
    # The trace is raw bytes; tensorboard reads it from 'local.trace'.
    with open(os.path.join(profiler_data_dir, 'local.trace'), 'wb') as trace_file:
      trace_file.write(trace_bytes)
    logging.info('Stopped TensorFlow profiler.')
  except Exception:  # pylint: disable=broad-except
    # A profiler failure must not crash the scheduler thread.
    logging.error('TensorFlow profiler failed to stop due to error:\n %s',
                  traceback.format_exc())
class TensorFlowProfiler(object):
  """Collect profiler data for Tensorboard with a separate thread."""

  def __init__(self, profiler_enabled_time_str, output_dir):
    """Constructor.

    Args:
      profiler_enabled_time_str: the value of the config --profiler_enabled_time
      output_dir: log directory to place the profiler data
    """
    self.profiler_enabled_time_str = profiler_enabled_time_str
    self.output_dir = output_dir
    self.exit_event = threading.Event()
    # The scheduler sleeps via _sleep_until_exit so stop() can interrupt it.
    self.scheduler = sched.scheduler(time.time, self._sleep_until_exit)

  def _sleep_until_exit(self, timeout):
    # Sleep in <= 1 second slices so a set exit_event is noticed promptly.
    deadline = time.time() + timeout
    remaining = deadline - time.time()
    while remaining > 0 and not self.exit_event.is_set():
      time.sleep(min(1, remaining))
      remaining = deadline - time.time()

  def start(self):
    """Schedule start/stop profiler event specified in profiler_enabled_time_str."""
    if not self.profiler_enabled_time_str:
      return

    last_end_time = -1
    for window_str in self.profiler_enabled_time_str.split(','):
      pieces = window_str.split(':')
      begin_time = int(pieces[0].strip())
      if len(pieces) > 1 and pieces[1].strip():
        end_time = int(pieces[1].strip())
      else:
        # No explicit end time: effectively run until the benchmark finishes.
        end_time = 365 * 24 * 60 * 60
      if begin_time <= last_end_time:
        raise ValueError('begin_time {} is no larger than the last '
                         'end_time {}'.format(begin_time, last_end_time))
      if end_time <= begin_time:
        raise ValueError('end_time {} is no larger than begin_time {}'.format(
            end_time, begin_time))
      # 4th positional arg added to support Python2 for the short-term.
      self.scheduler.enter(begin_time, 1, _start_profiler, ())  # pylint: disable=no-value-for-parameter
      self.scheduler.enter(end_time, 1, _stop_and_save_profiler,
                           argument=(self.output_dir,))
      last_end_time = end_time

    threading.Thread(target=self.scheduler.run).start()

  def stop(self):
    """Stop scheduler and save profiler data if any event is cancelled."""
    canceled_any = False
    for pending_event in self.scheduler.queue:
      try:
        self.scheduler.cancel(pending_event)
        canceled_any = True
      except ValueError:
        # This is OK because the event may have been just canceled
        pass
    # Wake the scheduler thread so it exits instead of sleeping on.
    self.exit_event.set()
    # Flush profiler data for any window that was cut short.
    if canceled_any:
      _stop_and_save_profiler(self.output_dir)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero utility methods."""
from __future__ import print_function
import importlib
import logging
import os
import shutil
import subprocess
import sys
import threading
import traceback
import requests
def create_empty_file(parent_directory, file_basename):
  """Creates an empty file with a given basename in a parent directory.

  Creates parent_directory and intermediate directories if it doesn't exist.
  This is mostly used for creating no-op actions in the Dockerfile.

  Args:
    parent_directory: The path to the parent directory.
    file_basename: The basename for the empty file.
  """
  if not os.path.isdir(parent_directory):
    os.makedirs(parent_directory)
  target_path = os.path.join(parent_directory, file_basename)
  # Opening in 'w' mode creates/truncates the file; nothing is written to it.
  with open(target_path, 'w'):
    print('Creating empty file: {}'.format(target_path))
def checkout_git_repos(git_repos, use_cached_site_packages):
  """Clone, update, or sync a repo.

  Args:
    git_repos: array of dict containing attributes of the git repo to checkout.
    use_cached_site_packages: If true, skip git pull if git_repo already exists.

  Returns:
    A dict containing attributes of the git repositories
  """
  site_package_info = {}
  for repo_spec in git_repos:
    url = repo_spec['url']
    local_path = repo_spec['local_path']
    logging.info('Checking out repository from %s to %s', url, local_path)

    # Clone only when the target directory does not exist yet.
    if not os.path.isdir(local_path):
      run_commands(['git clone {} {}'.format(url, local_path)])
    if 'branch' in repo_spec:
      run_commands(['git -C {} checkout {}'.format(
          local_path, repo_spec['branch'])])
    # Refresh the checkout unless cached packages are requested; always
    # refresh before pinning to an explicit hash.
    if not use_cached_site_packages or 'git_hash' in repo_spec:
      run_commands(['git -C {} pull --rebase'.format(local_path)])
    if 'git_hash' in repo_spec:
      run_commands(['git -C {} reset --hard {}'.format(
          local_path, repo_spec['git_hash'])])

    logging.info('Checked-out repository from %s to %s', url, local_path)
    site_package_info[repo_spec['dir_name']] = get_git_repo_info(local_path)

  return site_package_info
def get_git_repo_info(local_path):
  """Get information of the git repository specified by the local_path.

  Args:
    local_path: path of the local git repository to inspect.

  Returns:
    A dict with keys 'url', 'branch' and 'hash', or {} if any git
    query fails.
  """
  # The three queries share identical shell-out/parse/error handling, so
  # they are driven by a (key, command template) table instead of being
  # written out three times.
  queries = [
      ('url', 'git -C {} config --get remote.origin.url'),
      ('branch', 'git -C {} rev-parse --abbrev-ref HEAD'),
      ('hash', 'git -C {} rev-parse HEAD'),
  ]
  git_repo_info = {}
  for key, cmd_template in queries:
    exit_code, result = run_command(cmd_template.format(local_path))
    lines = result.splitlines()
    if exit_code == 0 and lines:
      git_repo_info[key] = lines[0]
    else:
      # Preserve the original per-query error message wording.
      logging.error('Error getting git %s for repository %s due to %s',
                    key, local_path, result)
      return {}
  return git_repo_info
def setup_python_path(site_packages_dir, python_path_str):
  """Appends each comma-separated path, rooted at site_packages_dir, to sys.path."""
  if python_path_str:
    for relative_path in python_path_str.split(','):
      logging.info('Adding path %s to sys.path', relative_path)
      sys.path.append(os.path.join(site_packages_dir, relative_path))
  # Log the resulting path even when nothing was added.
  logging.debug('PYTHONPATH: %s', sys.path)
def active_gcloud_service(gcloud_key_file_url, workspace_dir,
                          download_only=False):
  """Download key file and setup gcloud service credential using the key file.

  Args:
    gcloud_key_file_url: gcloud key file url
    workspace_dir: directory that the key file is downloaded to
    download_only: skip setting up the gcloud service credential if this is true
  """
  if not gcloud_key_file_url:
    return

  key_file_path = os.path.join(workspace_dir,
                               os.path.basename(gcloud_key_file_url))
  # Only fetch the key file when it is not already present locally.
  if not os.path.exists(key_file_path):
    download_data([{'url': gcloud_key_file_url, 'local_path': key_file_path}])

  if not download_only:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_file_path
    run_commands(['gcloud auth activate-service-account --key-file {}'.format(
        key_file_path)])
    logging.info('Activated gcloud service account credential')
def setup_gsutil_credential():
run_commands(['gcloud config set pass_credentials_to_gsutil true'])
def download_data(download_infos):
  """Download data from url to local_path for each (url, local_path) pair in the download_infos.

  Each url should start with either gs://, http:// or https://
  Downloaded file whose name ends with .gz will be decompressed in its
  current directory

  Args:
    download_infos: array of dict which specifies the url and local_path for
      data download
  """
  for info in download_infos:
    # Skip downloads whose destination already exists.
    if os.path.exists(info['local_path']):
      continue

    url = info['url']
    original_base_name = os.path.basename(url)
    expected_base_name = os.path.basename(info['local_path'])
    local_path_parent = os.path.dirname(info['local_path'])
    logging.info('Downloading data from %s to %s', url, info['local_path'])
    make_dir_if_not_exist(local_path_parent)

    # Fetch the data with a scheme-specific mechanism.
    if url.startswith('http://') or url.startswith('https://'):
      response = requests.get(url, allow_redirects=True)
      with open(info['local_path'], 'wb') as output_file:
        output_file.write(response.content)
    elif url.startswith('gs://'):
      run_commands(
          [['gsutil', '-m', 'cp', '-r', '-n', url, local_path_parent]],
          shell=False)
    elif url.startswith('file://'):
      run_commands([['cp', url[7:], local_path_parent]], shell=False)
    else:
      raise ValueError('Url {} with prefix {} is not supported.'.format(
          url, url.split(':')[0]))

    # Rename when the url basename differs from the requested local name.
    if original_base_name != expected_base_name:
      run_commands(['mv {} {}'.format(
          os.path.join(local_path_parent, original_base_name),
          os.path.join(local_path_parent, expected_base_name))])
    logging.info('Downloaded data from %s to %s', url, info['local_path'])

    # Decompress file if file name ends with .gz unless caller sets
    # 'decompress' to False in info.
    if url.endswith('.gz') and info.get('decompress', True):
      run_commands(['tar xvf {} -C {}'.format(
          info['local_path'], local_path_parent)])
      logging.info('Decompressed file %s', info['local_path'])
def parse_data_downloads_str(root_data_dir, data_downloads_str):
  """Parse a comma separated string into array of dicts.

  Each dict specifies the url and local_path for a download.

  Args:
    root_data_dir: the directory which should contain all the dataset files
    data_downloads_str: a comma separated string specified by the
      flag --data_downloads

  Returns:
    An array of dict which specifies the url and local_path for data download
  """
  if not data_downloads_str:
    return []

  download_infos = []
  for entry in data_downloads_str.split(','):
    parts = entry.split(';')
    raw_url = parts[0]
    # When no relative path is given, reuse the url's base name (computed
    # before the url is canonicalized below).
    if len(parts) > 1:
      relative_path = parts[1]
    else:
      relative_path = os.path.basename(raw_url)
    # Canonicalize url to remove trailing '/' and '*'
    url = raw_url
    if url.endswith('*'):
      url = url[:-1]
    if url.endswith('/'):
      url = url[:-1]
    download_infos.append({
        'url': url,
        'local_path': os.path.join(root_data_dir, relative_path),
    })
  return download_infos
def maybe_upload_to_gcs(local_dir, output_gcs_url):
  """Copies local_dir to output_gcs_url via gsutil; no-op when url is unset."""
  if not output_gcs_url:
    return
  run_commands(['gsutil -m cp -r {} {}'.format(local_dir, output_gcs_url)])
  logging.info('Uploaded data from local directory %s to gcs %s',
               local_dir, output_gcs_url)
def make_dir_if_not_exist(local_path):
  """Creates the directory (and intermediate directories) if it is missing.

  Args:
    local_path: the directory path to create.
  """
  if not os.path.exists(local_path):
    try:
      os.makedirs(local_path)
    except OSError:
      # Tolerate a race where another process created the directory between
      # the exists() check above and makedirs(); re-raise anything else.
      if not os.path.isdir(local_path):
        raise
    else:
      logging.info('Created directory %s', local_path)
def run_command(cmd, shell=True):
  """Runs a command, streaming its combined output to the debug log.

  Fix: the original docstring ("Structures for a variety of different test
  results.") was copy-pasted from elsewhere and did not describe this
  function; the code itself is unchanged.

  Args:
    cmd: Command to execute.
    shell: True to use shell, false otherwise.

  Returns:
    Tuple of the command return value and the standard out as a string.
  """
  logging.debug('Executing command: %s', cmd)
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT, shell=shell)
  exit_code = None
  line = ''
  stdout = ''
  # Keep polling until the process has exited AND its pipe is fully drained
  # (readline() returns '' only at EOF).
  while exit_code is None or line:
    exit_code = p.poll()
    line = p.stdout.readline().decode('utf-8')
    stdout += line
    logging.debug(line)
  return exit_code, stdout
def run_commands(cmds, shell=True):
  """Runs each command in cmds in order, raising on the first failure."""
  for command in cmds:
    code, output = run_command(command, shell=shell)
    if code:
      raise Exception('"{}" failed with code:{} and stdout:\n{}'.format(
          command, code, output))
def get_cpu_name():
  """Returns the CPU model name from /proc/cpuinfo, or '' on failure."""
  exit_code, result = run_command(
      "cat /proc/cpuinfo | grep 'model name' | sort --unique")
  lines = result.splitlines()
  if exit_code != 0 or not lines:
    logging.error('Error getting cpuinfo model name: %s', result)
    return ''
  # Line looks like "model name : <name>"; take the text after the colon.
  return lines[0].split(':')[1].strip()
def get_cpu_socket_count():
  """Returns the number of physical CPU sockets, or -1 on failure.

  Counts distinct "physical id" entries in /proc/cpuinfo.

  Fix: corrected the typo in the error log message ("scocket" -> "socket").
  """
  cmd = 'grep -i "physical id" /proc/cpuinfo | sort -u | wc -l'
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    return int(lines[0])
  else:
    logging.error('Error getting cpuinfo socket count: %s', result)
    return -1
def get_gpu_info():
  """Returns gpu information using nvidia-smi.

  Note: Assumes if the system has multiple GPUs that they are all the same with
  one exception. If the first result is a Quadro, the heuristic assumes
  this may be a workstation and takes the second entry.

  Fix: the original indexed lines[1] unconditionally, raising IndexError when
  nvidia-smi exits 0 but prints fewer than two lines (header + one GPU row);
  that case now logs an error and returns None like other failures.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `nvidia-smi` is not found or fails.
  """
  cmd = 'nvidia-smi --query-gpu=driver_version,gpu_name --format=csv'
  exit_code, result = run_command(cmd)

  if exit_code != 0:
    logging.error('nvidia-smi did not return as expected: %s', result)
    return None

  lines = result.splitlines()
  # First line is the CSV header; we need at least one data row below it.
  if len(lines) < 2:
    logging.error('nvidia-smi returned no GPU rows: %s', result)
    return None

  gpu_info_line = lines[1]
  if 'Quadro' in gpu_info_line and len(lines) >= 3:
    gpu_info_line = lines[2]

  gpu_info = {}
  gpu_info['gpu_driver_version'] = gpu_info_line.split(',')[0].strip()
  gpu_info['gpu_model'] = gpu_info_line.split(',')[1].strip()
  gpu_info['gpu_count'] = len(lines) - 1

  return gpu_info
def _install_tpu_tool():
  """Downloads the `ctpu` tool for managing cloud TPUs, if not yet present.

  Follows the instructions here:
  https://github.com/tensorflow/tpu/tree/master/tools/ctpu
  """
  if os.path.exists('ctpu'):
    return
  logging.info('Installing TPU tool')
  run_commands([
      'wget https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu',
      'chmod a+x ctpu',
  ])
def setup_tpu(parameters):
  """Brings up a cloud TPU with a given set of parameters via `ctpu up`.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during setup.
  """
  try:
    _install_tpu_tool()
    flag_list = [
        '--name={}'.format(parameters.get('name')),
        '--project={}'.format(parameters.get('project')),
        '--zone={}'.format(parameters.get('zone')),
        '--tpu-size={}'.format(parameters.get('size')),
        '--tf-version={}'.format(parameters.get('version')),
        '--tpu-only',
        '-noconf',
    ]
    cmd = './ctpu up {}'.format(' '.join(flag_list))
    logging.info('Setting up TPU: %s', cmd)
    code, out = run_command(cmd)
    if code != 0:
      logging.error('Error in setup with output: %s', out)
    return code != 0
  except Exception:
    # On any unexpected failure, remove the (possibly partial) tool and abort.
    logging.error('Unable to setup TPU')
    run_command('rm -f ctpu')
    sys.exit(1)
def cleanup_tpu(parameters):
  """Deletes an existing cloud TPU via `ctpu delete`.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during cleanup.
  """
  _install_tpu_tool()
  cmd = './ctpu delete {}'.format(' '.join([
      '--name={}'.format(parameters.get('name')),
      '--project={}'.format(parameters.get('project')),
      '--zone={}'.format(parameters.get('zone')),
      '--tpu-only',
      '-noconf',
  ]))
  logging.info('Cleaning up TPU: %s', cmd)
  exit_code, output = run_command(cmd)
  if exit_code != 0:
    logging.error('Error in cleanup with output: %s', output)
  return exit_code != 0
def read_benchmark_result(benchmark_result_file_path):
  """Reads a serialized BenchmarkEntries proto and returns its first entry.

  The protobuf/tensorflow imports are deferred to call time so that modules
  importing this file do not require TensorFlow to be installed.

  Args:
    benchmark_result_file_path: path to a file containing a serialized
      tensorflow BenchmarkEntries protobuf.

  Returns:
    A dict form of the first benchmark entry (proto field names preserved and
    default-valued fields included), or {} if the file does not exist.
  """
  from google.protobuf import json_format  # pylint: disable=g-import-not-at-top
  from tensorflow.core.util import test_log_pb2  # pylint: disable=g-import-not-at-top

  if not os.path.isfile(benchmark_result_file_path):
    logging.error('Failed to read benchmark result because '
                  'file %s does not exist', benchmark_result_file_path)
    return {}

  with open(benchmark_result_file_path, 'rb') as f:
    benchmark_entries = test_log_pb2.BenchmarkEntries()
    benchmark_entries.ParseFromString(f.read())

    # NOTE(review): assumes the file holds at least one entry; an empty
    # BenchmarkEntries would raise IndexError/KeyError here -- confirm that
    # callers guarantee this.
    return json_format.MessageToDict(
        benchmark_entries,
        preserving_proto_field_name=True,
        including_default_value_fields=True)['entry'][0]
def print_thread_stacktrace():
  """Prints the current stack of every live thread to stdout."""
  print('Here is the stacktrace for all threads:')
  names_by_ident = {}
  for t in threading.enumerate():
    names_by_ident[t.ident] = t.name
  for ident, frame in sys._current_frames().items():  # pylint: disable=protected-access
    # Fall back to the raw thread id when the thread has no registered name.
    print('Thread {}'.format(names_by_ident.get(ident, ident)))
    traceback.print_stack(frame)
def instantiate_benchmark_class(benchmark_class, output_dir, root_data_dir, tpu):
  """Imports `benchmark_class` (a dotted path) and returns an instance of it.

  The class is constructed with keyword arguments output_dir, root_data_dir
  and tpu.
  """
  module_import_path, class_name = benchmark_class.rsplit('.', 1)
  benchmark_cls = getattr(
      importlib.import_module(module_import_path), class_name)
  return benchmark_cls(
      output_dir=output_dir, root_data_dir=root_data_dir, tpu=tpu)
def copy_and_rename_dirs(dir_spec_string, dst_base_dir):
  """Copies list of <dir-path>:new_name specs into a new dest dir.

  If a path /path1/path2/dir:new_dir is given, it copies /path1/path2/dir to
  dst_base_dir/new_dir. A pre-existing destination directory is deleted first.

  Args:
    dir_spec_string: Comma separated list of /path1/path2:new_name specs.
    dst_base_dir: The base dir to contain the copies.
  """
  if not dir_spec_string:
    return
  for spec in dir_spec_string.split(','):
    src_dir, new_name = spec.split(':')
    dst_dir = os.path.join(dst_base_dir, new_name)
    if os.path.isdir(dst_dir):
      logging.info('[DELETE] pre-existing %s', dst_dir)
      shutil.rmtree(dst_dir)
    logging.info('[COPY] %s -> %s', src_dir, dst_dir)
    shutil.copytree(src_dir, dst_dir)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Checkout repository, download data and build docker image."""
from __future__ import print_function
import argparse
import json
import logging
import os
import shutil
import sys
import tempfile
import time
import perfzero.device_utils as device_utils
import perfzero.perfzero_config as perfzero_config
import perfzero.utils as utils
def _temporary_file_name(parent_dir, base_name):
"""Returns a temp name of the form <parent-dir>/<random>/<base-name>."""
if not os.path.isdir(parent_dir):
os.makedirs(parent_dir)
temp_dir = tempfile.mkdtemp(dir=parent_dir)
return os.path.join(temp_dir, base_name)
def _load_docker_image(FLAGS, workspace_dir, setup_execution_time):
  """Fetches a saved docker image and loads it with `docker load`.

  Fetches FLAGS.dockerfile_path to workspace_dir/<temp-dir>/local_docker,
  runs `docker load --input <path>`, then deletes the temp dir whether or
  not the load succeeded.

  Args:
    FLAGS: parser.parse_known_args object.
    workspace_dir: String - The path to use for intermediate artifacts.
    setup_execution_time: Map from string->double containing wall times for
      different operations. This will have insertions describing the docker
      setup time.
  """
  start = time.time()
  local_image = _temporary_file_name(workspace_dir, 'local_docker')
  utils.download_data([{'url': FLAGS.dockerfile_path,
                        'local_path': local_image,
                        'decompress': False}])
  setup_execution_time['fetch_docker'] = time.time() - start

  try:
    utils.run_commands([
        'docker load --input {}'.format(local_image),
        'docker images',  # Print loaded image list.
    ])
    setup_execution_time['load_docker'] = time.time() - start
  finally:
    logging.info('removing parent dir of local docker image copy %s',
                 local_image)
    shutil.rmtree(os.path.dirname(local_image))
def _create_docker_image(FLAGS, project_dir, workspace_dir,
                         setup_execution_time):
  """Creates a docker image via `docker build`.

  May fetch a TensorFlow pip wheel into the docker context first, and mutates
  FLAGS.tensorflow_pip_spec in that case (see below).

  Args:
    FLAGS: parser.parse_known_args object.
    project_dir: String - The current project path.
    workspace_dir: String - The path to use for intermediate artifacts.
    setup_execution_time: Map from string->double containing wall times for
      different operations. This will have insertions describing the docker
      setup time.
  """
  # Create docker image
  docker_start_time = time.time()
  # The docker build context directory; files referenced by the Dockerfile
  # must live under here.
  docker_context = os.path.join(workspace_dir, 'resources')
  # Necessary in case we don't have a local .whl file.
  utils.create_empty_file(docker_context, 'EMPTY')

  # Download TensorFlow pip package from Google Cloud Storage and modify
  # package path accordingly, if applicable.
  local_tensorflow_pip_spec = None
  if (FLAGS.tensorflow_pip_spec and
      (FLAGS.tensorflow_pip_spec.startswith('gs://') or
       FLAGS.tensorflow_pip_spec.startswith('file://'))):
    local_pip_filename = os.path.basename(FLAGS.tensorflow_pip_spec)
    local_pip_path = os.path.join(docker_context, local_pip_filename)
    utils.download_data([{'url': FLAGS.tensorflow_pip_spec,
                          'local_path': local_pip_path}])
    # Update path to pip wheel file for the Dockerfile. Note that this path has
    # to be relative to the docker context (absolute path will not work).
    # NOTE(review): this intentionally mutates FLAGS so later consumers see
    # the local filename instead of the gs://-or-file:// url.
    FLAGS.tensorflow_pip_spec = local_pip_filename
    local_tensorflow_pip_spec = local_pip_filename
  else:
    local_tensorflow_pip_spec = 'EMPTY'

  dockerfile_path = FLAGS.dockerfile_path
  if not os.path.exists(dockerfile_path):
    # Fall back to the deprecated approach if the user-specified
    # dockerfile_path does not exist
    dockerfile_path = os.path.join(project_dir, FLAGS.dockerfile_path)

  # Semicolons would break the single-quoted --build-arg below, so strip them.
  extra_pip_specs = (FLAGS.extra_pip_specs or '').replace(';', '')
  docker_base_cmd = 'docker build --no-cache --pull'

  cmd = '{docker_base_cmd} -t {docker_tag}{tf_pip}{local_tf_pip}{extra_pip} {suffix}'.format(
      docker_base_cmd=docker_base_cmd,
      docker_tag=FLAGS.docker_tag,
      tf_pip=(
          ' --build-arg tensorflow_pip_spec={}'.format(
              FLAGS.tensorflow_pip_spec) if FLAGS.tensorflow_pip_spec else ''),
      # local_tensorflow_pip_spec is either string 'EMPTY' or basename of
      # local .whl file.
      local_tf_pip=' --build-arg local_tensorflow_pip_spec={}'.format(
          local_tensorflow_pip_spec),
      extra_pip=' --build-arg extra_pip_specs=\'{}\''.format(extra_pip_specs),
      # NOTE(review): docker_context is always a non-empty os.path.join()
      # result here, so the '- < dockerfile' branch appears unreachable --
      # confirm whether stdin-based builds are still intended.
      suffix=(
          '-f {} {}'.format(dockerfile_path, docker_context)
          if docker_context else '- < {}'.format(dockerfile_path))
  )
  utils.run_commands([cmd])
  logging.info('Built docker image with tag %s', FLAGS.docker_tag)
  setup_execution_time['build_docker'] = time.time() - docker_start_time
if __name__ == '__main__':
  # Parse the setup flags registered by perfzero_config; unknown arguments
  # are treated as a fatal error.
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  perfzero_config.add_setup_parser_arguments(parser)
  FLAGS, unparsed = parser.parse_known_args()
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                      level=logging.DEBUG)

  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  # Wall-time per setup operation, reported as JSON at the end.
  setup_execution_time = {}
  # project_dir is the parent of this script's directory.
  project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
  workspace_dir = os.path.join(project_dir, FLAGS.workspace)

  site_package_dir = os.path.join(workspace_dir, 'site-packages')
  utils.copy_and_rename_dirs(FLAGS.site_package_downloads,
                             site_package_dir)

  # gcloud credentials are only activated when some input lives on GCS.
  activate_gcloud = False
  if FLAGS.dockerfile_path and FLAGS.dockerfile_path.startswith('gs://'):
    # We might end up doing gsutil fetch later, so need to call
    # active_gcloud_service().
    activate_gcloud = True
  if FLAGS.tensorflow_pip_spec and FLAGS.tensorflow_pip_spec.startswith('gs://'):
    activate_gcloud = True

  # Download gcloud auth token. Remove this operation in the future when
  # docker in Kokoro can access the GCP metadata server
  start_time = time.time()
  utils.active_gcloud_service(FLAGS.gcloud_key_file_url,
                              workspace_dir, download_only=not activate_gcloud)
  setup_execution_time['download_token'] = time.time() - start_time

  # Set up the raid array.
  start_time = time.time()
  device_utils.create_drive_from_devices(FLAGS.root_data_dir,
                                         FLAGS.gce_nvme_raid)
  setup_execution_time['create_drive'] = time.time() - start_time

  # A .tar.gz path is assumed to be a pre-built image to `docker load`;
  # anything else is treated as a Dockerfile to build.
  if FLAGS.dockerfile_path:
    if FLAGS.dockerfile_path.endswith('.tar.gz'):
      logging.info('Assuming given file %s is a docker image to load',
                   FLAGS.dockerfile_path)
      _load_docker_image(FLAGS, workspace_dir,
                         setup_execution_time)
    else:
      _create_docker_image(FLAGS, project_dir, workspace_dir,
                           setup_execution_time)

  logging.info('Setup time in seconds by operation:\n %s',
               json.dumps(setup_execution_time, indent=2))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment