dcuai / dlexamples · Commits

Commit a32ffa95, authored Feb 03, 2023 by qianyj

update TensorFlow2x test method

Parent: e286da17
Changes: 268 files. Showing 20 changed files with 0 additions and 2845 deletions.
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tf_v1  +0 -88
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tf_v2  +0 -85
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tf_v2_1  +0 -96
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tfx  +0 -262
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_cuda11_8_0_0_180  +0 -95
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_experimental_cuda11  +0 -109
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/cuda_diff.sh  +0 -105
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/diff_benchmarks.py  +0 -117
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/requirements_temp.txt  +0 -27
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/resnet50_synth.sh  +0 -82
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/run_single_benchmark.sh  +0 -74
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark.py  +0 -193
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark_test.py  +0 -57
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/cloud_manager.py  +0 -431
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/benchmark_method_runner.py  +0 -187
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/device_utils.py  +0 -86
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config.py  +0 -367
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config_test.py  +0 -54
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/process_info_tracker.py  +0 -93
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/report_utils.py  +0 -237
Too many changes to show: only 268 of 268+ files are displayed.
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tf_v1
deleted, 100644 → 0
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu
# - Installs requirements.txt for tensorflow/models
#
# This docker is not needed and is the same as the tf_v2 docker. The
# user can pass in the desired `ARG tensorflow_pip_spec`. Remove
# once TF 1.0 testing is done or KOKORO jobs are updated to use the
# tensorflow_pip_spec rather than the docker path to control the TF version.
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG extra_pip_specs=""
ARG local_tensorflow_pip_spec=""
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.0.64-1+cuda10.0 \
libcudnn7-dev=7.6.0.64-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
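The header comment above notes that the TF version for this image is meant to be controlled through the tensorflow_pip_spec build argument rather than the Dockerfile path. As a minimal by-hand sketch (the image tag and pip spec are illustrative, and the empty placeholder file stands in for the local wheel argument that setup.py normally supplies from the build context):

# Sketch only: build this image directly instead of through perfzero/lib/setup.py.
touch EMPTY_FILE   # placeholder for local_tensorflow_pip_spec when no local wheel is used
docker build \
  -f perfzero/docker/Dockerfile_ubuntu_1804_tf_v1 \
  --build-arg tensorflow_pip_spec="tf-nightly-gpu" \
  --build-arg local_tensorflow_pip_spec=EMPTY_FILE \
  -t perfzero/tensorflow .

In normal use this step is wrapped by perfzero/lib/setup.py via its --dockerfile_path and --tensorflow_pip_spec flags, as the dockertest scripts later in this commit show.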
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tf_v2
deleted, 100644 → 0
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu (this is TF 2.0)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an existing empty file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.2.24-1+cuda10.0 \
libcudnn7-dev=7.6.2.24-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tf_v2_1
deleted, 100644 → 0
# Ubuntu 18.04 Python3 with CUDA 10.1 and the following:
# - Installs tf-nightly-gpu (this is TF 2.1)
# - Installs requirements.txt for tensorflow/models
# - TF 2.0 tested with cuda 10.0, but we need to test tf 2.1 with cuda 10.1.
FROM nvidia/cuda:10.1-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an existing empty file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-1 \
cuda-cufft-10-1 \
cuda-curand-10-1 \
cuda-cusolver-10-1 \
cuda-cusparse-10-1 \
libcudnn7=7.6.4.38-1+cuda10.1 \
libcudnn7-dev=7.6.4.38-1+cuda10.1 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.1 \
libnvinfer-dev=5.1.5-1+cuda10.1 \
libnvinfer6=6.0.1-1+cuda10.1 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_1804_tfx
deleted, 100644 → 0
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu (this is TF 2.0)
# - Installs requirements.txt for tensorflow/models
#
# NOTE: Branched from Dockerfile_ubuntu_1804_tf_v2 with changes for
# TFX benchmarks.
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# Specifies the default package version to use if no corresponding commit_id
# override is specified.
# If "head", uses the GitHub HEAD version.
# If "release", uses the latest released version from PyPI, REGARDLESS of
# package-compatibility requirements (e.g. even if tfx requires
# tensorflow-model-analysis<0.22, if tensorflow-model-analysis==0.22.0 is
# the latest released version on PyPI, we will install that).
# Packages include: tfx, tensorflow-transform, tensorflow-model-analysis,
# tensorflow-data-validation, tensorflow-metadata, tfx-bsl
ARG default_package_version="head"
# Specifies the package version to use for the corresponding packages.
# If empty, uses the default specified by default_package_version.
# If "head", uses the GitHub HEAD version.
# If "release", uses the latest released version from PyPI, REGARDLESS of
# package-compatibility requirements.
# If "github_commit:<commit id>", uses the given commit ID from GitHub.
# If "github_tag:<tag>" uses the given tag from GitHub.
# If "pypi:<version string>", uses the given package version from PyPI.
ARG tfx_package_version=""
ARG tensorflow_transform_package_version=""
ARG tensorflow_model_analysis_package_version=""
ARG tensorflow_data_validation_package_version=""
ARG tensorflow_metadata_package_version=""
ARG tfx_bsl_package_version=""
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an existing empty file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.2.24-1+cuda10.0 \
libcudnn7-dev=7.6.2.24-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN if [ ! -z "${extra_pip_specs}" ]; then pip install --upgrade --force-reinstall ${extra_pip_specs}; fi
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
# Install yolk3k, for getting package versions from PyPI (so we can pull
# TFX from GitHub even when we need to install from the released version)
RUN pip install yolk3k
# Install protoc
RUN PROTOC_ZIP=protoc-3.7.1-linux-x86_64.zip; \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.7.1/$PROTOC_ZIP; \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc; \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*'; \
rm -f $PROTOC_ZIP;
# Install Bazel
RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
RUN echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
RUN apt update
RUN apt install -y bazel
# Create symlink to "python3" binary under the name "python" so Bazel doesn't complain about "python" not being found
RUN ln -s $(which python3) /usr/bin/python
SHELL ["/bin/bash", "-c"]
RUN \
function install_package { \
# e.g. "head" or "release" \
default_version="$1"; \
# e.g "tensorflow-model-analysis" \
package_name="$2"; \
# e.g "model-analysis" \
package_repo_name="$3"; \
# How this package should be installed if pulled from GitHub. \
# "none" for no extra installation steps required \
# "bdist_wheel" for python setup.py bdist_wheel \
package_install_type=$4; \
# e.g. "head" or "release" or "pypi:0.21.4" or "github:[commit hash]" \
package_version="$5"; \
\
# e.g. "tensorflow_model_analysis" \
package_name_underscores=${package_name//-/_}; \
if [ "$package_version" == "" ]; then \
package_version="$default_version"; \
fi; \
\
commit_id=""; \
pypi_version=""; \
if [ "$package_version" == "head" ]; then \
commit_id=$(git ls-remote https://github.com/tensorflow/${package_repo_name} refs/heads/master | cut -f1); \
echo ${package_name}: latest commit from GitHub: ${commit_id}; \
elif [ "$package_version" == "release" ]; then \
pypi_version=$(yolk -V $package_name | head -n 1 | cut -d' ' -f 2-); \
echo ${package_name}: latest version from PyPI: ${pypi_version}; \
elif [ "${package_version:0:5}" == "pypi:" ]; then \
pypi_version="${package_version:5}"; \
echo ${package_name}: using specified PyPI version: ${pypi_version}; \
elif [ "${package_version:0:7}" == "github:" ]; then \
commit_id="${package_version:7}"; \
echo ${package_name}: using specified GitHub commit: ${commit_id}; \
else \
echo Unknown package version for ${package_name}: ${package_version}; \
exit 1; \
fi; \
\
if [ "$commit_id" != "" ]; then \
if [ "$package_install_type" == "none" ]; then \
# Package doesn't need extra installation steps - install directly from GitHub. \
pip install -e git+https://github.com/tensorflow/${package_repo_name}.git@${commit_id}#egg=${package_name_underscores}; \
install_commands+=("pip install --force --no-deps -e git+https://github.com/tensorflow/${package_repo_name}.git@${commit_id}#egg=${package_name_underscores}"); \
echo Installed ${package_name} from GitHub commit ${commit_id}; \
elif [ "$package_install_type" == "bdist_wheel" ]; then \
# Package needs extra installation steps. Clone from GitHub, then build and install. \
git clone https://github.com/tensorflow/${package_repo_name}.git; \
pushd ${package_repo_name}; \
git checkout ${commit_id}; \
if [ "$package_name" == "tfx" ]; then \
echo Building TFX pip package from source; \
sed -i 's@packages=packages,@packages=packages, package_data={package_name: ["benchmarks/datasets/chicago_taxi/data/taxi_1M.tfrecords.gz"]},@' setup.py; \
package_build/initialize.sh; \
python package_build/ml-pipelines-sdk/setup.py bdist_wheel; \
python package_build/tfx/setup.py bdist_wheel; \
else \
echo Using python setup.py bdist_wheel to build package; \
python setup.py bdist_wheel; \
fi; \
pip install dist/*.whl; \
install_commands+=("pip install --force --no-deps ${PWD}/dist/*.whl"); \
popd; \
echo Built and installed ${package_name} from GitHub commit ${commit_id}; \
fi; \
# Write GIT_COMMIT_ID attribute to the installed package. \
package_path=$(python3 -c "import ${package_name_underscores}; print(list(${package_name_underscores}.__path__)[0])"); \
echo "GIT_COMMIT_ID='${commit_id}'" >> ${package_path}/__init__.py; \
install_commands+=("echo \"GIT_COMMIT_ID='${commit_id}'\" >> ${package_path}/__init__.py;"); \
elif [ "$pypi_version" != "" ]; then \
if [ "$package_name" == "tfx" ]; then \
# Special handling for TFX - we want to install from GitHub, and get \
# the data files as well (they are not included in the pip package). \
# Install from the corresponding tag in GitHub. \
echo Special handling for tfx: will install tfx from GitHub tag for version ${pypi_version}; \
git clone --depth 1 --branch v${pypi_version} https://github.com/tensorflow/tfx.git; \
pushd tfx; \
echo Building TFX pip package from source; \
sed -i 's@packages=packages,@packages=packages, package_data={package_name: ["benchmarks/datasets/chicago_taxi/data/taxi_1M.tfrecords.gz"]},@' setup.py; \
package_build/initialize.sh; \
python package_build/ml-pipelines-sdk/setup.py bdist_wheel; \
python package_build/tfx/setup.py bdist_wheel; \
pip install dist/*.whl; \
install_commands+=("pip install --force --no-deps ${PWD}/dist/*.whl"); \
popd; \
echo Installed tfx from GitHub tag for version ${pypi_version}; \
else \
pip install ${package_name}==${pypi_version}; \
install_commands+=("pip install --force --no-deps ${package_name}==${pypi_version}"); \
echo Installed ${package_name} from PyPI version ${pypi_version}; \
fi; \
else \
echo Neither commit_id nor pypi_version was set for ${package_name}; \
exit 1; \
fi; \
}; \
\
# Array of commands to run post-installation. This is for forcing \
# installation of packages without regard to the requirements of other \
# packages. \
# The first round of installations installs the packages and their \
# requirements. This may result in some packages being re-installed at \
# versions other than the requested versions due to requirements from \
# other packages. \
# The second round of installations via install_commands \
# forces installations of the packages at the desired versions, ignoring \
# any dependencies of these packages or other packages. Note that if there \
# are incompatible package dependencies (e.g. tfx depends on \
# apache-beam==2.21 and tensorflow-transform depends on apache-beam==2.22 \
# then either could be installed depending on the installation order). \
install_commands=(); \
install_package "${default_package_version}" "tfx" "tfx" "bdist_wheel" "${tfx_package_version}"; \
install_package "${default_package_version}" "tensorflow-transform" "transform" "none" "${tensorflow_transform_package_version}"; \
install_package "${default_package_version}" "tensorflow-model-analysis" "model-analysis" "none" "${tensorflow_model_analysis_package_version}"; \
install_package "${default_package_version}" "tensorflow-data-validation" "data-validation" "bdist_wheel" "${tensorflow_data_validation_package_version}"; \
install_package "${default_package_version}" "tensorflow-metadata" "metadata" "bdist_wheel" "${tensorflow_metadata_package_version}"; \
install_package "${default_package_version}" "tfx-bsl" "tfx-bsl" "bdist_wheel" "${tfx_bsl_package_version}"; \
for cmd in "${install_commands[@]}"; do \
echo Running "${cmd}"; \
eval $cmd; \
done;
# Uninstall the TensorFlow version that TFX / the TFX components installed, and
# force install the version requested.
RUN pip uninstall -y tensorflow
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec}
RUN pip freeze
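The version-spec ARGs at the top of this Dockerfile (default_package_version, tfx_package_version, and so on) are resolved by the install_package shell function above. A minimal build sketch, with illustrative values only and the same empty placeholder file convention for the local wheel argument:

# Sketch only: pin TFX to its latest release while keeping the other packages at GitHub HEAD.
touch EMPTY_FILE   # placeholder for local_tensorflow_pip_spec
docker build \
  -f perfzero/docker/Dockerfile_ubuntu_1804_tfx \
  --build-arg tensorflow_pip_spec="tf-nightly-gpu" \
  --build-arg local_tensorflow_pip_spec=EMPTY_FILE \
  --build-arg default_package_version="head" \
  --build-arg tfx_package_version="release" \
  --build-arg tensorflow_model_analysis_package_version="pypi:0.22.0" \
  -t perfzero/tfx .

Per the comments above, "head" pulls the GitHub HEAD, "release" takes the latest PyPI release regardless of compatibility pins, and "pypi:<version>" pins an exact PyPI version.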
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_cuda11_8_0_0_180
deleted, 100644 → 0
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (this is TF 2.3)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an existing empty file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
--allow-downgrades --allow-change-held-packages \
build-essential \
cuda-tools-11-0 \
cuda-toolkit-11-0 \
libcudnn8=8.0.0.180-1+cuda11.0 \
libcudnn8-dev=8.0.0.180-1+cuda11.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install tf-estimator-nightly
RUN pip install tensorflow-text-nightly
# RUN nvidia-smi
RUN nvcc --version
RUN pip freeze
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/docker/Dockerfile_ubuntu_experimental_cuda11
deleted, 100644 → 0
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (this is TF 2.3)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ENV PIP_CMD="python3.9 -m pip"
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an existing empty file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
# Needed to disable prompts during installation.
ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# Python 3.9 related deps in this ppa.
RUN add-apt-repository ppa:deadsnakes/ppa
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3.9 \
python3-pip \
python3.9-dev \
python3-setuptools \
python3.9-venv \
python3.9-distutils \
python3.9-lib2to3
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN ${PIP_CMD} install --upgrade pip
RUN ${PIP_CMD} install --upgrade distlib
# setuptools upgraded to fix install requirements from model garden.
RUN ${PIP_CMD} install --upgrade setuptools
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.2/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
RUN ${PIP_CMD} install --upgrade pyyaml
RUN ${PIP_CMD} install --upgrade google-api-python-client==1.8.0
RUN ${PIP_CMD} install --upgrade google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN ${PIP_CMD} install wheel
RUN ${PIP_CMD} install absl-py
RUN ${PIP_CMD} install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN ${PIP_CMD} install tfds-nightly
RUN ${PIP_CMD} install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN ${PIP_CMD} install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN ${PIP_CMD} install -r /tmp/requirements.txt
RUN ${PIP_CMD} install tf-estimator-nightly
RUN ${PIP_CMD} install tensorflow-text-nightly
# RUN nvidia-smi
RUN nvcc --version
RUN pip freeze
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/cuda_diff.sh
deleted, 100644 → 0
#!/bin/bash
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
set -e
set -x

git clone https://github.com/tensorflow/benchmarks.git
cd benchmarks

tf_spec="tf-nightly-gpu==2.5.0.dev20210212"

CIFAR10_DATA="gs://tf-perf-imagenet-uswest1/tensorflow/cifar10_data/cifar-10-batches-bin"
CIFAR10_BENCHMARKS="official.benchmark.keras_cifar_benchmark.Resnet56KerasBenchmarkReal.benchmark_1_gpu_no_dist_strat"
RESNET50_DATA="gs://tf-perf-imagenet-uswest1/tensorflow/imagenet"
RESNET50_BENCHMARKS="official.benchmark.resnet_ctl_imagenet_benchmark.Resnet50CtlBenchmarkReal.benchmark_1_gpu_fp16"

function run_single_benchmark() {
  docker_name=$1
  label=$2
  data_downloads=$3
  benchmark_methods=$4
  perfzero_pwd=`pwd`

  nvidia-docker run \
    -v ${perfzero_pwd}:/workspace \
    -v /data:/data \
    -e PERFZERO_RUN_TAGS= \
    -e PERFZERO_TRACKING_ID= \
    -e PERFZERO_COMMIT_LABEL= \
    -e PERFZERO_EXECUTION_BRANCH=master \
    -e CUDNN_LOGINFO_DBG=1 \
    -e CUDNN_LOGDEST_DBG=stderr \
    ${docker_name} \
    python3 /workspace/perfzero/lib/benchmark.py \
    --bigquery_dataset_table_name="" \
    --data_downloads="${data_downloads}" \
    --ml_framework_build_label=v2-nightly-gpu \
    --execution_label="${label}" \
    --platform_name=kokoro-gcp \
    --system_name=n1-standard-8-1xA100 \
    --output_gcs_url="" \
    --benchmark_class_type= \
    --scratch_gcs_url= \
    --root_data_dir=/data \
    --benchmark_num_trials=2 \
    --bigquery_project_name="" \
    --git_repos="https://github.com/tensorflow/models.git;benchmark" \
    --python_path=models \
    --benchmark_methods=${benchmark_methods} \
    --result_upload_methods="" \
    --gcloud_key_file_url="${PERFZERO_GCLOUD_KEY_FILE_URL}" \
    --tpu_parameters=
}

function run_benchmarks() {
  docker_name=$1
  label=$2
  run_single_benchmark ${docker_name} ${label} "${CIFAR10_DATA}" "${CIFAR10_BENCHMARKS}"
  run_single_benchmark ${docker_name} ${label} "${RESNET50_DATA}" "${RESNET50_BENCHMARKS}"
}

function setup_docker() {
  label=$1
  dockerfile=$2
  echo "`date` Setting up ${label} docker..."
  sudo python3 perfzero/lib/setup.py \
    --gce_nvme_raid= \
    --docker_tag="${label}" \
    --gcloud_key_file_url= \
    --tensorflow_pip_spec=${tf_spec} \
    --dockerfile_path=${dockerfile}
  echo "`date` Finished setting up ${label} docker."
}

function diff_benchmarks() {
  python3 perfzero/dockertest/diff_benchmarks.py `pwd`
}

baseline_docker="docker/Dockerfile_ubuntu_cuda11_8_0_0_180"
experiment_docker="docker/Dockerfile_ubuntu_1804_tf_cuda_11"

setup_docker "control/tensorflow" ${baseline_docker}
run_benchmarks "control/tensorflow" "control-8-0-0-180"
setup_docker "experiment/tensorflow" ${experiment_docker}
run_benchmarks "experiment/tensorflow" "experiment-8-0-4-30"
diff_benchmarks
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/diff_benchmarks.py
deleted, 100644 → 0
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""Simple script to diff benchmark results.
This script will read all the summary files from a base output directory and
print a human readable diff report.
"""
import json
import os
import sys


def _find_perfzero_logs(docker_output_dir):
  """Finds pairs of json_file, output_log file from all methods."""
  summary_files = []
  for root, _, files in os.walk(docker_output_dir):
    for summary_file in files:
      if summary_file.endswith('perfzero_summary.json'):
        full_summary_file = os.path.join(root, summary_file)
        summary_files.append(full_summary_file)
        sys.stdout.write('Found json {}\n'.format(full_summary_file))
  return summary_files


def _load_summaries(summary_files):
  """Loads input json file paths and returns json objects."""
  summary_jsons = []
  for summary_file in summary_files:
    with open(summary_file, 'r') as f:
      summary_json = json.load(f)
      summary_jsons.append(summary_json)
  return summary_jsons


def _summarize_benchmarks(summary_files):
  """Remaps list of json files -> summaries by benchmark method."""
  summary_jsons = _load_summaries(summary_files)
  performance_by_method = {}
  for summary_json in summary_jsons:
    method = summary_json['benchmark_result']['name']
    trial = summary_json['benchmark_result']['trial_id']
    metrics_list = summary_json['benchmark_result']['metrics']
    metrics = {}
    for metric_info in metrics_list:
      metrics[metric_info['name']] = metric_info['value']
    metrics['wall_time'] = summary_json['benchmark_result']['wall_time']
    label = summary_json['benchmark_info']['execution_label']
    performance_by_method.setdefault(method, {}).setdefault(label, [])
    performance_by_method[method][label].append((trial, metrics))
  return performance_by_method


def _print_diff_report(performance_by_method):
  """Prints a diff report of benchmark performance."""
  print('Performance report:')
  print(json.dumps(performance_by_method, indent=2))

  method_to_metric_to_perf = {}
  for method in performance_by_method:
    for label, label_data in performance_by_method[method].items():
      latest_trial_data = max(label_data, key=lambda x: x[0])
      latest_metrics = latest_trial_data[1]
      for metric, value in latest_metrics.items():
        method_to_metric_to_perf.setdefault(method, {}).setdefault(metric, [])
        method_to_metric_to_perf[method][metric].append((label, value))

  print('Diff report:')
  for method in sorted(method_to_metric_to_perf):
    print('-- benchmark: {}'.format(method))
    for metric in sorted(method_to_metric_to_perf[method].keys()):
      value_list = []
      for label, value in sorted(
          method_to_metric_to_perf[method][metric], key=lambda x: x[0]):
        print('  {}: {}: {}'.format(metric, label, value))
        value_list.append(value)
      if len(value_list) == 2:
        control_val = float(value_list[0])
        expt_val = float(value_list[1])
        if abs(control_val) > 1e-5:
          diff_pct = (expt_val / control_val - 1.0) * 100.0
        else:
          diff_pct = -1.0
        print('  diff: {:2.2f}%'.format(diff_pct))


def main():
  if len(sys.argv) != 2:
    raise RuntimeError('Usage: {} <base perfzero output dir>'.format(sys.argv[0]))
  perfzero_output_dir = sys.argv[1]
  summary_files = _find_perfzero_logs(perfzero_output_dir)
  performance_by_method = _summarize_benchmarks(summary_files)
  _print_diff_report(performance_by_method)


if __name__ == '__main__':
  main()
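A usage sketch for the script above (the directory argument is illustrative): point it at the base directory under which the per-run perfzero_summary.json files were written, for example the PerfZero workspace produced by the cuda_diff.sh runs earlier in this commit.

# Sketch only: the single argument is the base perfzero output directory to walk.
python3 perfzero/dockertest/diff_benchmarks.py perfzero/workspace/output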
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/requirements_temp.txt
deleted, 100644 → 0
six
google-api-python-client>=1.6.7
kaggle>=1.3.9
numpy>=1.15.4
oauth2client
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy>=0.19.1
tensorflow-hub>=0.6.0
tensorflow-model-optimization>=0.4.1
tensorflow-datasets
tensorflow-addons
dataclasses;python_version<"3.7"
gin-config
tf_slim>=1.1.0
Cython
matplotlib
pyyaml>=5.1
# CV related dependencies
opencv-python-headless
Pillow
pycocotools
# NLP related dependencies
seqeval
sentencepiece
sacrebleu
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/resnet50_synth.sh
deleted, 100644 → 0
#!/bin/bash
set -e
set -x

# To run this script from a GCP VM / host connected to GPUs:
# git clone https://github.com/tensorflow/benchmarks.git
# cd benchmarks
# bash perfzero/dockertest/resnet50_synth.sh
# Output log files/results will be stored at perfzero/workspace/output/
# Modify INPUT_PARAMS variables below to tweak the tf whl under test / benchmark methods / dataset paths.
# You can comment out "build_docker" call at the end, if the docker's already built.

## INPUT PARAMS: start
# Acceptable formats for TF_PIP_SPEC
# pypi nightlies: tf-nightly-gpu==2.6.0.dev20210521
# gcs path to whls: gs://some-path-to-tf.whl
# Local path to whl: file://some-local-path-to-tf.whl
TF_PIP_SPEC="tf-nightly-gpu==2.6.0.dev20210624"

# Path to GCS or local files containing the input datasets (if they need to be fetched into the docker).
DATA_DOWNLOADS=""

# Comma separated list of strings.
BENCHMARK_METHODS="official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.benchmark_1_gpu_fp16"

# If either the tf_pip_spec or data downloads reference private GCP, then we
# need to set GCLOUD_KEY_FILE_URL to a credentials file.
GCLOUD_KEY_FILE_URL=""

# Commit id under repository tensorflow/models, branch='benchmark' which has the benchmarks.
MODELS_GIT_HASH="169e4051aef247c27a95748a8015b2f35f509e1a"
## INPUT PARAMS: end

build_docker() {
  echo "building docker"
  sudo python3 perfzero/lib/setup.py \
    --dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_cuda_11 \
    --tensorflow_pip_spec="${TF_PIP_SPEC}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}" \
    --extra_docker_build_args=
  sudo docker images
}

run_benchmark() {
  echo "running benchmark"
  benchmark_tag=$1
  env_var=$2
  sudo nvidia-docker run \
    -v ${PWD}:/workspace \
    -v /data:/data \
    -e PERFZERO_EXECUTION_MODE=test \
    -e TF_ENABLE_LEGACY_FILESYSTEM=1 \
    -e ${env_var} \
    perfzero/tensorflow python3 \
    /workspace/perfzero/lib/benchmark.py \
    --root_data_dir=/data \
    --bigquery_dataset_table_name="" \
    --benchmark_class_type= \
    --ml_framework_build_label=v2-nightly-gpu-${benchmark_tag} \
    --execution_label=test-benchmark \
    --platform_name=kokoro-gcp \
    --system_name=n1-standard-8-1xV100 \
    --output_gcs_url="" \
    --benchmark_num_trials=1 \
    --scratch_gcs_url= \
    --bigquery_project_name="" \
    --git_repos="https://github.com/tensorflow/models.git;benchmark;${MODELS_GIT_HASH}" \
    --data_downloads="${DATA_DOWNLOADS}" \
    --python_path=models \
    --benchmark_methods="${BENCHMARK_METHODS}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}"
}

build_docker
run_benchmark "control" "TF_CUDNN_USE_FRONTEND=false"
run_benchmark "experiment" "TF_CUDNN_USE_FRONTEND=true"
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/run_single_benchmark.sh
deleted, 100644 → 0
#!/bin/bash
set -e
set -x

# To run this script from a GCP VM / host connected to GPUs:
# git clone https://github.com/tensorflow/benchmarks.git
# cd benchmarks
# bash perfzero/dockertest/run_single_benchmark.sh
# Output log files/results will be stored at perfzero/workspace/output/
# Modify INPUT_PARAMS variables below to tweak the tf whl under test / benchmark methods / dataset paths.
# You can comment out "build_docker" call at the end, if the docker's already built.

## INPUT PARAMS: start
# Acceptable formats for TF_PIP_SPEC
# pypi nightlies: tf-nightly-gpu==2.6.0.dev20210521
# gcs path to whls: gs://some-path-to-tf.whl
# Local path to whl: file://some-local-path-to-tf.whl
TF_PIP_SPEC="tf-nightly-gpu==2.6.0.dev20210521"

# Path to GCS or local files containing the input datasets (if they need to be fetched into the docker).
DATA_DOWNLOADS=""

# Comma separated list of strings.
BENCHMARK_METHODS="official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.benchmark_xla_1_gpu_fp16"

# If either the tf_pip_spec or data downloads reference private GCP, then we
# need to set GCLOUD_KEY_FILE_URL to a credentials file.
GCLOUD_KEY_FILE_URL=""
## INPUT PARAMS: end

build_docker() {
  echo "building docker"
  sudo python3 perfzero/lib/setup.py \
    --dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_cuda_11 \
    --tensorflow_pip_spec="${TF_PIP_SPEC}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}" \
    --extra_docker_build_args=
  sudo docker images
}

run_benchmark() {
  echo "running benchmark"
  sudo nvidia-docker run \
    -v ${PWD}:/workspace \
    -v /data:/data \
    -e PERFZERO_EXECUTION_MODE=test \
    -e TF_ENABLE_LEGACY_FILESYSTEM=1 \
    perfzero/tensorflow python3 \
    /workspace/perfzero/lib/benchmark.py \
    --root_data_dir=/data \
    --bigquery_dataset_table_name="" \
    --benchmark_class_type= \
    --ml_framework_build_label=v2-nightly-gpu \
    --execution_label=test-benchmark \
    --platform_name=kokoro-gcp \
    --system_name=n1-standard-8-1xV100 \
    --output_gcs_url="" \
    --benchmark_num_trials=1 \
    --scratch_gcs_url= \
    --bigquery_project_name="" \
    --git_repos='https://github.com/tensorflow/models.git;benchmark;f7938e6ad46fecfa1112eda579eb046eb3f3bf96' \
    --data_downloads="${DATA_DOWNLOADS}" \
    --python_path=models \
    --benchmark_methods="${BENCHMARK_METHODS}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}"
}

build_docker
run_benchmark
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark.py
deleted, 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute benchmark."""
from __future__ import print_function

import argparse
import json
import logging
import multiprocessing
import os
import re
import sys
import time

import perfzero.benchmark_method_runner as benchmark_method_runner
import perfzero.perfzero_config as perfzero_config
import perfzero.tpu_runtime_utils as tpu_runtime_utils
import perfzero.utils as utils


class BenchmarkRunner(object):
  """Execute benchmark and report results."""

  def __init__(self, config):
    self.config = config
    self.project_dir = os.path.abspath(
        os.path.dirname(os.path.dirname(__file__)))
    self.workspace_dir = os.path.join(self.project_dir, config.workspace)
    self.site_packages_dir = os.path.join(self.workspace_dir, 'site-packages')
    self.root_output_dir = os.path.join(self.workspace_dir, 'output')
    self.benchmark_execution_time = {}

  def _setup(self):
    """Download data and checkout git repository."""
    # Activate gcloud service
    start_time = time.time()
    utils.setup_python_path(self.site_packages_dir, self.config.python_path_str)
    utils.active_gcloud_service(self.config.gcloud_key_file_url,
                                self.workspace_dir)
    utils.make_dir_if_not_exist(self.root_output_dir)
    self.benchmark_execution_time['activate_gcloud_service'] = (
        time.time() - start_time)

    # Download data
    start_time = time.time()
    utils.download_data(utils.parse_data_downloads_str(
        self.config.root_data_dir, self.config.gcs_downloads_str))
    utils.download_data(utils.parse_data_downloads_str(
        self.config.root_data_dir, self.config.data_downloads_str))
    self.benchmark_execution_time['download_data'] = time.time() - start_time

    # Checkout git repositories
    start_time = time.time()
    site_package_info = utils.checkout_git_repos(
        self.config.get_git_repos(self.site_packages_dir),
        self.config.use_cached_site_packages)
    self.benchmark_execution_time['checkout_repository'] = (
        time.time() - start_time)

    # Start cloud TPU.
    if self.config.tpu_parameters is not None:
      start_time = time.time()
      utils.setup_tpu(self.config.tpu_parameters)
      tpu_info = tpu_runtime_utils.configure_tpu(self.config.tpu_parameters)
      site_package_info['tpu_version'] = tpu_info
      self.benchmark_execution_time['start_tpu'] = time.time() - start_time

    self.stream_handler = logging.StreamHandler(sys.stdout)
    self.stream_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(self.stream_handler)

    return site_package_info

  def _get_benchmark_methods(self):
    """Returns list of benchmark methods to execute."""
    filter_prefix = 'filter:'
    benchmark_methods = []
    for benchmark_method_pattern in self.config.benchmark_method_patterns:
      if filter_prefix not in benchmark_method_pattern:
        benchmark_methods.append(benchmark_method_pattern)
      else:
        index = benchmark_method_pattern.find(filter_prefix)
        benchmark_class = benchmark_method_pattern[:index - 1]
        pattern = benchmark_method_pattern[index + len(filter_prefix):]
        class_instance = utils.instantiate_benchmark_class(
            benchmark_class, '/dev/null', '', None, {},
            benchmark_class_type=self.config.benchmark_class_type)
        for benchmark_method_name in dir(class_instance):
          if re.match(pattern, benchmark_method_name):
            benchmark_methods.append(benchmark_class + '.' +
                                     benchmark_method_name)
    logging.info('The following benchmark methods will be executed: %s',
                 benchmark_methods)
    return benchmark_methods

  def _run_benchmarks_trial(self, harness_info, site_package_info,
                            benchmark_methods, trial_id):
    """Runs a single trial of all benchmark methods."""
    # Run the benchmark method in a separate process so that its memory usage
    # will not affect the execution of other benchmark methods.
    # This is a workaround before we fix all memory leak issues in TensorFlow.
    has_exception = False
    benchmark_success_results = {}
    benchmark_output_dirs = {}
    benchmark_execution_time = {}
    for benchmark_method in benchmark_methods:
      queue = multiprocessing.Queue()
      process = multiprocessing.Process(
          target=benchmark_method_runner.run,
          args=(benchmark_method, harness_info, site_package_info,
                self.root_output_dir, self.config, queue, trial_id))
      process.start()
      process.join()
      method_has_exception, method_execution_time, succeeded, output_dir = queue.get()  # pylint: disable=line-too-long
      has_exception |= method_has_exception
      benchmark_execution_time[benchmark_method] = method_execution_time
      benchmark_success_results[benchmark_method] = succeeded
      benchmark_output_dirs[benchmark_method] = output_dir
    return (has_exception, benchmark_success_results, benchmark_output_dirs,
            benchmark_execution_time)

  def run_benchmark(self):
    """Run benchmark."""
    harness_info = utils.get_git_repo_info(self.project_dir)
    has_exception = False
    benchmark_success_results = {}
    benchmark_output_dirs = {}
    num_trials = self.config.benchmark_num_trials
    try:
      site_package_info = self._setup()
      benchmark_methods = self._get_benchmark_methods()
      print('Setup complete. Running {} trials'.format(num_trials))
      for trial_id in range(1, num_trials + 1):
        print('Running trial {} / {}'.format(trial_id, num_trials))
        (trial_has_exception, trial_success_results, trial_output_dirs,
         trial_execution_time) = self._run_benchmarks_trial(
             harness_info, site_package_info, benchmark_methods, trial_id)
        trial_key = 'trial_{}'.format(trial_id)
        has_exception |= trial_has_exception
        self.benchmark_execution_time[trial_key] = trial_execution_time
        benchmark_success_results[trial_key] = trial_success_results
        benchmark_output_dirs[trial_key] = trial_output_dirs
    finally:
      if self.config.tpu_parameters is not None:
        has_exception |= utils.cleanup_tpu(self.config.tpu_parameters)
      print('Benchmark execution time in seconds by operation:\n{}'.format(
          json.dumps(self.benchmark_execution_time, indent=2)))
      print('Benchmark success results:\n{}'.format(
          json.dumps(benchmark_success_results, indent=2)))
      print('Benchmark local output directories:\n{}'.format(
          json.dumps(benchmark_output_dirs, indent=2)))
    if has_exception:
      sys.exit(1)


if __name__ == '__main__':
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  perfzero_config.add_benchmark_parser_arguments(parser)
  FLAGS, unparsed = parser.parse_known_args()

  level = logging.DEBUG if FLAGS.debug else logging.INFO
  logging.basicConfig(
      format='%(asctime)s %(levelname)s: %(message)s', level=level)

  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  config_ = perfzero_config.PerfZeroConfig(mode='flags', flags=FLAGS)
  benchmark_runner = BenchmarkRunner(config_)
  benchmark_runner.run_benchmark()
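Worth calling out in _get_benchmark_methods above: a --benchmark_methods entry may end in filter:<regex>, in which case the benchmark class is instantiated and every method whose name matches the regex is run. A hedged invocation sketch follows; the class path matches names used in the dockertest scripts in this commit, while the remaining flag values are illustrative and most other flags accepted by perfzero_config are omitted.

# Sketch only: run every Resnet50KerasBenchmarkSynth method matching benchmark_1_gpu.*
python3 /workspace/perfzero/lib/benchmark.py \
  --git_repos="https://github.com/tensorflow/models.git;benchmark" \
  --python_path=models \
  --root_data_dir=/data \
  --benchmark_num_trials=1 \
  --benchmark_methods=official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.filter:benchmark_1_gpu.*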
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark_test.py
deleted, 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark.py."""
from __future__ import print_function

import sys
import unittest

import benchmark
import mock


class TestBenchmarkRunner(unittest.TestCase):

  def test_get_benchmark_methods_filter(self):
    """Tests returning methods on a class based on a filter."""
    config = mock.Mock()
    config.workspace = 'workspace'
    config.benchmark_method_patterns = ['new_foo.BenchmarkClass.filter:bench.*']
    benchmark_runner = benchmark.BenchmarkRunner(config)

    mock_benchmark_class = mock.Mock()
    mock_benchmark_class.benchmark_method_1 = 'foo'
    mock_module = mock.Mock()
    sys.modules['new_foo'] = mock_module
    mock_module.BenchmarkClass.return_value = mock_benchmark_class

    methods = benchmark_runner._get_benchmark_methods()
    self.assertEqual(1, len(methods))
    self.assertEqual('new_foo.BenchmarkClass.benchmark_method_1', methods[0])

  def test_get_benchmark_methods_exact_match(self):
    """Tests returning methods on a class based on a filter."""
    config = mock.Mock()
    config.workspace = 'workspace'
    config.benchmark_method_patterns = [
        'new_foo.BenchmarkClass.benchmark_method_1',
        'new_foo.BenchmarkClass.benchmark_method_2'
    ]
    benchmark_runner = benchmark.BenchmarkRunner(config)
    methods = benchmark_runner._get_benchmark_methods()
    self.assertEqual([
        'new_foo.BenchmarkClass.benchmark_method_1',
        'new_foo.BenchmarkClass.benchmark_method_2'
    ], methods)
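A sketch of running this test file directly, assuming the mock package is installed and the working directory is perfzero/lib so that the benchmark module is importable:

# Sketch only: run the unit tests from the directory that contains benchmark.py.
cd perfzero/lib
python3 -m unittest -v benchmark_test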
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/cloud_manager.py
deleted, 100644 → 0
#!/usr/bin/python
#
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper script to create, query and stop machine in GCP."""
from __future__ import print_function

import argparse
import getpass
import logging
import subprocess
import sys
import time

INSTANCE_NAME_PREFIX = 'perfzero-dev-'


def run_command(cmd, is_from_user=False):
  """Runs the command and throws an error if the return code is non-zero.

  Args:
    cmd: Command to execute
    is_from_user: If true, log the command and the command output in INFO
      level. Otherwise, log these in the DEBUG level.

  Returns:
    a string representing the command output

  Raises:
    Exception: raised when the command execution has non-zero exit code
  """
  _log = logging.info if is_from_user else logging.debug
  _log('Executing command: {}'.format(cmd))
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                       shell=True)

  exit_code = None
  line = ''
  stdout = ''
  while exit_code is None or line:
    exit_code = p.poll()
    line = p.stdout.readline().decode('utf-8')
    stdout += line
    _log(line)

  if exit_code and is_from_user:
    sys.exit(exit_code)
  elif exit_code:
    raise Exception('Command:\n{}\nfailed with output:\n{}'.format(cmd, stdout))

  return stdout


def get_instance_name(username):
  return INSTANCE_NAME_PREFIX + username


def get_machine_type(machine_type, accelerator_count):
  """Get machine type for the instance.

  - Use the user-specified machine_type if it is not None
  - Otherwise, use the standard type with cpu_count = 8 x accelerator_count
    if user-specified accelerator_count > 0
  - Otherwise, use the standard type with 8 cpu

  Args:
    machine_type: machine_type specified by the user
    accelerator_count: accelerator count

  Returns:
    the machine type used for the instance
  """
  if machine_type:
    return machine_type
  cpu_count = max(accelerator_count, 1) * 8
  return 'n1-standard-{}'.format(cpu_count)


def _ssh_prefix(project, zone, internal_ip, key_file):
  if internal_ip:
    ssh_prefix = 'gcloud beta compute ssh --internal-ip'
  else:
    ssh_prefix = 'gcloud compute ssh'
  if key_file:
    ssh_prefix = '{} --ssh-key-file={}'.format(ssh_prefix, key_file)
  return '{} --project={} --zone={}'.format(ssh_prefix, project, zone)


def create(username, project, zone, machine_type, accelerator_count,
           accelerator_type, image, nvme_count, ssh_internal_ip, ssh_key_file,
           cpu_min_platform=None, boot_ssd_size=None):
  """Create gcloud computing instance.

  Args:
    username: the username of the current user
    project: project name
    zone: zone of the GCP computing instance
    machine_type: the machine type used for the instance
    accelerator_count: the number of pieces of the accelerator to attach to
      the instance
    accelerator_type: the specific type of accelerator to attach to the
      instance
    image: the name of the image that the disk will be initialized with
    nvme_count: the number of NVME local SSD devices to attach to the instance
    ssh_internal_ip: internal ip to use for ssh.
    ssh_key_file: ssh key file to use to connect to instance.
    cpu_min_platform: minimum CPU platform to use, if None use default.
    boot_ssd_size: If set, boot disk is changed to SSD and this size (GB) is
      used.
  """
  instance_name = get_instance_name(username)
  machine_type = get_machine_type(machine_type, accelerator_count)
  logging.debug('Creating gcloud computing instance %s', instance_name)

  cmd = '''gcloud compute instances create {} \
--image={} \
--project={} \
--zone={} \
--machine-type={} \
--maintenance-policy=TERMINATE \
'''.format(instance_name, image, project, zone, machine_type)

  if boot_ssd_size:
    cmd += '--boot-disk-size={}GB --boot-disk-type=pd-ssd '.format(boot_ssd_size)

  if accelerator_count > 0:
    cmd += '--accelerator=count={},type={} '.format(accelerator_count,
                                                    accelerator_type)

  if cpu_min_platform:
    cmd += '--min-cpu-platform="{}" '.format(cpu_min_platform)

  for _ in range(nvme_count):
    cmd += '--local-ssd=interface=NVME '

  run_command(cmd, is_from_user=True)
  logging.info('Successfully created gcloud computing instance %s '
               'with %s accelerator.\n', instance_name, accelerator_count)

  ssh_prefix = _ssh_prefix(project, zone, ssh_internal_ip, ssh_key_file)

  # Wait until we can ssh to the newly created computing instance
  cmd = '{} --strict-host-key-checking=no --command="exit" {}'.format(
      ssh_prefix, instance_name)
  ssh_remaining_retries = 12
  ssh_error = None
  while ssh_remaining_retries > 0:
    ssh_remaining_retries -= 1
    try:
      run_command(cmd, is_from_user=False)
      ssh_error = None
    except Exception as error:  # pylint: disable=broad-except
      ssh_error = error
      if ssh_remaining_retries:
        logging.info('Cannot ssh to the computing instance. '
                     'Try again after 5 seconds')
        time.sleep(5)
      else:
        logging.error('Cannot ssh to the computing instance after '
                      '60 seconds due to error:\n%s', str(ssh_error))

  if ssh_error:
    logging.info('Run the commands below manually after ssh into the computing '
                 'instance:\n'
                 'git clone https://github.com/tensorflow/benchmarks.git\n'
                 'sudo usermod -a -G docker $USER\n')
  else:
    cmd = '{} --command="git clone {}" {}'.format(
        ssh_prefix, 'https://github.com/tensorflow/benchmarks.git',
        instance_name)
    run_command(cmd, is_from_user=True)
    logging.info('Successfully checked-out PerfZero code on the '
                 'computing instance\n')

    cmd = '{} --command="sudo usermod -a -G docker $USER" {}'.format(
        ssh_prefix, instance_name)
    run_command(cmd, is_from_user=True)
    logging.info('Successfully added user to the docker group\n')

  cmd = '{} {} -- -L 6006:127.0.0.1:6006'.format(ssh_prefix, instance_name)
  logging.info('Run the command below to ssh to the instance together with '
               'port forwarding for tensorboard:\n%s\n', cmd)


def status(username, project, zone, ssh_internal_ip, ssh_key_file):
  """Query the status of the computing instance.

  Args:
    username: the username of the current user.
    project: project name.
    zone: zone of the GCP computing instance.
    ssh_internal_ip: internal ip of the instance.
    ssh_key_file: SSH key file to use to connect to the instance.
  """
  instance_name = get_instance_name(username)
  logging.debug('Querying status of gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)

  cmd = 'gcloud compute instances list --filter="name={} AND zone:{}" --project {}'.format(  # pylint: disable=line-too-long
      instance_name, zone, project)
  stdout = run_command(cmd, is_from_user=True)

  num_instances = len(stdout.splitlines()) - 1
  logging.info('\nFound %s gcloud computing instance with name %s.\n',
               num_instances, instance_name)

  if num_instances == 1:
    cmd = '{} {} -- -L 6006:127.0.0.1:6006'.format(
        _ssh_prefix(project, zone, ssh_internal_ip, ssh_key_file),
        instance_name)
    logging.info('Run the command below to ssh to the instance together with '
                 'port forwarding for tensorboard:\n%s\n', cmd)


def list_all(project):
  logging.debug('Finding all gcloud computing instance of project %s created '
                'for PerfZero test', project)
  cmd = 'gcloud compute instances list --filter="name ~ {}" --project={}'.format(  # pylint: disable=line-too-long
      INSTANCE_NAME_PREFIX, project)
  stdout = run_command(cmd, is_from_user=True)

  num_instances = len(stdout.splitlines()) - 1
  logging.info('\nFound %s gcloud computing instance of project %s created '
               'for PerfZero test', num_instances, project)


def start(username, project, zone):
  instance_name = get_instance_name(username)
  logging.debug('Starting gcloud computing instance %s of project %s '
                'in zone %s', instance_name, project, zone)

  cmd = 'gcloud compute instances start {} --project={} --zone={}'.format(
      instance_name, project, zone)
  run_command(cmd, is_from_user=True)
  logging.debug('\nSuccessfully started gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)


def stop(username, project, zone):
  instance_name = get_instance_name(username)
  logging.debug('Stopping gcloud computing instance %s of project %s in '
                'zone %s', instance_name, project, zone)

  cmd = 'gcloud compute instances stop {} --project={} --zone={}'.format(
      instance_name, project, zone)
  run_command(cmd, is_from_user=True)
  logging.debug('\nSuccessfully stopped gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)


def delete(username, project, zone):
  instance_name = get_instance_name(username)
  logging.debug('Deleting gcloud computing instance %s of project %s in '
                'zone %s', instance_name, project, zone)

  cmd = 'echo Y | gcloud compute instances delete {} --project={} --zone={}'.format(  # pylint: disable=line-too-long
      instance_name, project, zone)
  run_command(cmd, is_from_user=True)
  logging.debug('\nSuccessfully deleted gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)


def parse_arguments(argv, command):  # pylint: disable=redefined-outer-name
  """Parse command line arguments and return parsed flags.

  Args:
    argv: command line arguments
    command: the subcommand requested by the user

  Returns:
    parsed flags
  """
  # pylint: disable=redefined-outer-name
  parser = argparse.ArgumentParser(
      usage='cloud_manager.py {} [<args>]'.format(command),
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument(
      '--debug',
      action='store_true',
      help='If set, use debug level logging. Otherwise, use info level logging')
  parser.add_argument(
      '--project',
      default='google.com:tensorflow-performance',
      type=str,
      help='Google Cloud Platform project name to use for this invocation')

  if command in ['create', 'start', 'stop', 'delete', 'status']:
    parser.add_argument(
        '--username',
        default=getpass.getuser(),
        type=str,
        help='''Username that uniquely identifies the name of computing instance created for PerfZero.
        The default value is your ldap username.
        ''')
    parser.add_argument(
        '--zone',
        default='us-west1-b',
        type=str,
        help='Zone of the instance to create.')
    parser.add_argument(
        '--ssh-internal-ip',
        action='store_true',
        help='If set, use internal IP for ssh with `gcloud beta compute ssh`.')
    parser.add_argument(
        '--ssh-key-file',
        default=None,
        type=str,
        help='The ssh key to use with `gcloud (beta) compute ssh`.')

  if command == 'create':
    parser.add_argument(
        '--accelerator_count',
        default=1,
        type=int,
        help='The number of pieces of the accelerator to attach to the instance')
    parser.add_argument(
        '--accelerator_type',
        default='nvidia-tesla-v100',
        type=str,
        help='''The specific type (e.g. nvidia-tesla-v100 for nVidia Tesla V100) of
        accelerator to attach to the instance. Use 'gcloud compute accelerator-types list --project=${project_name}' to
        learn about all available accelerator types.
        ''')
    parser.add_argument(
        '--cpu_min_platform',
        default=None,
        type=str,
        help='''Minimum cpu platform, only needed for CPU only instances.''')
    parser.add_argument(
        '--machine_type',
        default=None,
        type=str,
        help='''The machine type used for the instance. To get a list of available machine
        types, run 'gcloud compute machine-types list --project=${project_name}'
        ''')
    parser.add_argument(
        '--image',
        default='tf-ubuntu-1604-20180927-410',
        type=str,
        help='''Specifies the name of the image that the disk will be initialized with.
        A new disk will be created based on the given image. To view a list of
        public images and projects, run 'gcloud compute images list --project=${project_name}'. It is best
        practice to use image when a specific version of an image is needed.
        ''')
    parser.add_argument(
        '--nvme_count',
        default=0,
        type=int,
        help='''Specifies the number of NVME local SSD devices to attach to the instance.
        ''')
    parser.add_argument(
        '--boot_ssd_size',
        default=None,
        type=int,
        help='''Specifies the size (GB) of the boot disk or size is the image
        size. Setting this also changes boot disk to Persistent SSD.
        ''')

  flags, unparsed = parser.parse_known_args(argv)  # pylint: disable=redefined-outer-name
  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  level = logging.DEBUG if flags.debug else logging.INFO
  logging.basicConfig(format='%(message)s', level=level)

  return flags


if __name__ == '__main__':
  parser = argparse.ArgumentParser(
      usage='''cloud_manager.py <command> [<args>]

The supported commands are:
  create:   Create a computing instance in gcloud that is unique to the specified username, which is your ldap by default
  start:    Start the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  stop:     Stop the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  delete:   Delete the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  status:   Query the status and information of the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  list_all: Query the status of all computing instances that are created by this script.''')

  parser.add_argument('command', type=str)

  flags = parser.parse_args(sys.argv[1:2])
  command = flags.command
  if not hasattr(sys.modules[__name__], command):
    print('Error: The command <{}> is not recognized\n'.format(command))
    parser.print_help()
    sys.exit(1)

  flags = parse_arguments(sys.argv[2:], command)

  if command == 'create':
    create(flags.username, flags.project, flags.zone, flags.machine_type,
           flags.accelerator_count, flags.accelerator_type, flags.image,
           flags.nvme_count, flags.ssh_internal_ip, flags.ssh_key_file,
           cpu_min_platform=flags.cpu_min_platform,
           boot_ssd_size=flags.boot_ssd_size)
  elif command == 'start':
    start(flags.username, flags.project, flags.zone)
  elif command == 'stop':
    stop(flags.username, flags.project, flags.zone)
  elif command == 'delete':
    delete(flags.username, flags.project, flags.zone)
  elif command == 'status':
    status(flags.username, flags.project, flags.zone, flags.ssh_internal_ip,
           flags.ssh_key_file)
  elif command == 'list_all':
    list_all(flags.project)
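
The script above is normally driven from its command line (for example `python cloud_manager.py create --accelerator_count=8`), but the helpers can also be imported and called directly. A minimal sketch, assuming the module is on the Python path and gcloud is authenticated; the username is illustrative, while the project, zone, image and accelerator values are the script's own defaults:

import cloud_manager

# Creates the instance 'perfzero-dev-alice' with 8 V100s, waits until ssh
# works, clones the benchmarks repo on the VM and prints the ssh command
# with tensorboard port forwarding.
cloud_manager.create(
    username='alice',                         # illustrative user name
    project='google.com:tensorflow-performance',
    zone='us-west1-b',
    machine_type=None,                        # resolves to n1-standard-64 for 8 GPUs
    accelerator_count=8,
    accelerator_type='nvidia-tesla-v100',
    image='tf-ubuntu-1604-20180927-410',
    nvme_count=0,
    ssh_internal_ip=False,
    ssh_key_file=None)

# Later: query or tear down the same instance.
cloud_manager.status('alice', 'google.com:tensorflow-performance',
                     'us-west1-b', False, None)
cloud_manager.delete('alice', 'google.com:tensorflow-performance',
                     'us-west1-b')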
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/benchmark_method_runner.py
deleted 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute a single benchmark method."""
from __future__ import print_function

import datetime
import json
import logging
import os
import time
import traceback

from perfzero.process_info_tracker import ProcessInfoTracker
import perfzero.report_utils as report_utils
from perfzero.tensorflow_profiler import TensorFlowProfiler
import perfzero.utils as utils


def run(benchmark_method, harness_info, site_package_info, root_output_dir,
        config, queue, trial_id):
  try:
    _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue, trial_id)
  except Exception:  # pylint: disable=broad-except
    logging.error('Benchmark execution for %s failed due to error:\n%s',
                  benchmark_method, traceback.format_exc())
    queue.put((True, None, False, None))


def _set_file_contents(content_str, output_filename):
  with open(output_filename, 'w') as f:
    f.write(content_str)
  logging.info('Wrote summary to file %s', output_filename)


def _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue, trial_id):
  """Run benchmark method and put result to the queue.

  Args:
    benchmark_method: Canonical path to the benchmark method
    harness_info: Description of the benchmark harness used in the benchmark
    site_package_info: Description of the site-package used in the benchmark
    root_output_dir: Directory under which to put the benchmark output
    config: An instance of perfzero_config
    queue: An interprocess queue to transfer benchmark result to the caller.
    trial_id: An integer trial id to annotate in the benchmark result.
  """
  start_timestamp = time.time()
  execution_timestamp = start_timestamp
  method_has_exception = False
  execution_id = (config.execution_id if config.execution_id else
                  datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
  output_dir = os.path.join(root_output_dir, execution_id)
  if config.scratch_gcs_url:
    model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
  else:
    model_output_dir = output_dir
  utils.make_dir_if_not_exist(output_dir)
  benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
  benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

  tensorflow_profiler = TensorFlowProfiler(
      config.profiler_enabled_time_str, output_dir)
  process_info_tracker = ProcessInfoTracker(output_dir)
  process_info = None

  # Setup per-method file logger
  filehandler = logging.FileHandler(
      filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
  filehandler.setFormatter(
      logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
  logging.getLogger().addHandler(filehandler)

  try:
    if config.tpu_parameters:
      tpu = config.tpu_parameters.get('name')
    else:
      tpu = None

    if config.perfzero_constructor_args:
      constructor_args = json.loads(config.perfzero_constructor_args)
    else:
      constructor_args = {}

    class_instance = utils.instantiate_benchmark_class(
        benchmark_class=benchmark_class,
        output_dir=model_output_dir,
        root_data_dir=config.root_data_dir,
        tpu=tpu,
        constructor_args=constructor_args,
        benchmark_class_type=config.benchmark_class_type)

    # tf.test.Benchmark.report_benchmark() writes results to a file with
    # path benchmark_result_file_path_prefix + benchmark_method
    benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
    os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
    benchmark_result_file_path = '{}{}.{}'.format(
        benchmark_result_file_path_prefix, benchmark_class_name,
        benchmark_method_name)

    # Start background threads for profiler and system info tracker
    tensorflow_profiler.start()
    process_info_tracker.start()

    # Run benchmark method
    execution_timestamp = time.time()
    logging.info('Starting benchmark execution: %s', benchmark_method)
    getattr(class_instance, benchmark_method_name)()
    logging.info('Stopped benchmark: %s', benchmark_method)

    # Read and build benchmark results
    raw_benchmark_result = utils.read_benchmark_result(
        benchmark_result_file_path)
    # Explicitly overwrite the name to be the full path to benchmark method
    raw_benchmark_result['name'] = benchmark_method
  except Exception:  # pylint: disable=broad-except
    logging.error('Benchmark execution for %s failed due to error:\n%s',
                  benchmark_method, traceback.format_exc())
    method_has_exception = True
    raw_benchmark_result = {}
    raw_benchmark_result['name'] = benchmark_method
    raw_benchmark_result['wall_time'] = -1
    raw_benchmark_result['extras'] = {}
  finally:
    # Stop background threads for profiler and system info tracker
    process_info = process_info_tracker.stop()
    tensorflow_profiler.stop()

  upload_timestamp = time.time()
  benchmark_result = report_utils.build_benchmark_result(
      raw_benchmark_result, method_has_exception, trial_id)
  execution_summary = report_utils.build_execution_summary(
      execution_timestamp,
      execution_id,
      config.ml_framework_build_label,
      config.execution_label,
      config.platform_name,
      config.system_name,
      config.output_gcs_url,
      benchmark_result,
      config.get_env_vars(),
      config.get_flags(),
      harness_info,
      site_package_info,
      process_info,
      method_has_exception,
      is_tpu_benchmark=(config.tpu_parameters != None))
  report_utils.upload_execution_summary(
      config.bigquery_project_name,
      config.bigquery_dataset_table_name,
      execution_summary)
  report_utils.execute_methods(
      config.result_upload_methods, execution_summary)
  logging.info('Benchmark execution for %s completed with summary:\n%s',
               benchmark_method, json.dumps(execution_summary, indent=2))
  _set_file_contents(json.dumps(execution_summary, indent=2),
                     os.path.join(output_dir, 'perfzero_summary.json'))
  utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
  logging.getLogger().removeHandler(filehandler)
  method_execution_time = {
      'class_initialization': execution_timestamp - start_timestamp,
      'method_execution': upload_timestamp - execution_timestamp,
      'log_upload': time.time() - upload_timestamp
  }

  if config.profiler_enabled_time_str:
    relative_output_dir = output_dir[output_dir.find('benchmark'):]
    print('\nExecute the command below to start tensorboard server using '
          'the collected profiler data:\ntensorboard --logdir={}\n\n'
          'Open localhost:6006 in your browser to access the Tensorboard '
          'GUI. Use ssh with port forwarding if tensorboard is running on '
          'a remote machine.\n'.format(relative_output_dir))

  queue.put((method_has_exception, method_execution_time,
             benchmark_result['succeeded'], output_dir))
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/device_utils.py
deleted 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Setup the data drive with raid, RAM, or mount network drives."""
from __future__ import print_function

import logging
import perfzero.utils as utils


def create_drive_from_devices(data_dir, gce_nvme_raid):
  """Creates a drive at data directory."""
  if not gce_nvme_raid:
    return

  devices = _get_nvme_devices()
  cmd = 'mountpoint -q {}'.format(data_dir)
  retcode, _ = utils.run_command(cmd)
  if retcode:
    if len(devices) > 1:
      _create_drive_raid(data_dir, devices)
    else:
      _create_single_drive(data_dir, devices[0])


def _get_nvme_devices():
  """Returns list paths to nvme devices."""
  devices = []
  cmd = 'lsblk'
  retcode, log = utils.run_command(cmd)
  if retcode:
    raise Exception('"{}" failed with code:{} and log:\n{}'.format(
        cmd, retcode, log))

  lines = log.splitlines()
  if lines:
    for line in lines:
      if line.startswith('nvme'):
        parts = line.split()
        devices.append('/dev/' + parts[0].strip())
  return devices


def _create_single_drive(data_dir, device):
  """Creates a data drive out of a single device."""
  cmds = []
  cmds.append('mkfs.ext4 -F {}'.format(device))
  cmds.append('mkdir -p {}'.format(data_dir))
  cmds.append('mount {} {}'.format(device, data_dir))
  cmds.append('chmod a+w {}'.format(data_dir))

  utils.run_commands(cmds)
  logging.info('Created and mounted device %s at %s', device, data_dir)


def _create_drive_raid(data_dir, devices):
  """Creates a raid zero array of nvme drives."""
  cmds = []
  # Passing 'yes' because GCE nvme drive are sometimes in an odd state and
  # think they are in another raid. mdadm does not have -y option.
  # Or the kokoro images were left dirty? and that is where the info
  # comes from.
  cmds.append('yes | mdadm --create /dev/md0 --level=0 '
              '--raid-devices={} {}'.format(len(devices), ' '.join(devices)))
  cmds.append('mkfs.ext4 -F /dev/md0')
  cmds.append('mkdir -p {}'.format(data_dir))
  cmds.append('mount /dev/md0 {}'.format(data_dir))
  cmds.append('chmod a+w {}'.format(data_dir))

  utils.run_commands(cmds)
  logging.info('Created and mounted RAID array at %s', data_dir)
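
For orientation, the entry point above is a no-op unless the setup flag --gce_nvme_raid was set, and it only formats and mounts when data_dir is not already a mountpoint. A minimal sketch of a direct call, assuming root privileges and GCE local NVMe SSDs attached to the VM; the values are illustrative:

import perfzero.device_utils as device_utils

# Any non-empty string enables the RAID/format step; '/data' matches the
# default --root_data_dir flag.
device_utils.create_drive_from_devices('/data', gce_nvme_raid='enabled')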
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config.py
deleted 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero configs provided by user."""
from __future__ import print_function

import json
import logging
import os


def add_setup_parser_arguments(parser):
  """Add arguments to the parser used by the setup.py."""
  parser.add_argument(
      '--dockerfile_path',
      default='docker/Dockerfile_ubuntu_1804_tf_v1',
      type=str,
      help='''Build the docker image using docker file located at the ${pwd}/${dockerfile_path} if
      it exists, where ${pwd} is user's current work directory. Otherwise, build
      the docker image using the docker file located at path_to_perfzero/${dockerfile_path}.
      ''')
  parser.add_argument(
      '--workspace',
      default='workspace',
      type=str,
      help='''The gcloud key file will be downloaded under directory path_to_perfzero/${workspace}
      ''')
  parser.add_argument(
      '--gcloud_key_file_url',
      default='',
      type=str,
      help='''DEPRECATED: Use --gcloud_key_file_url of setup.py instead.
      The gcloud key file url. When specified, it will be downloaded to the
      directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
      ''')
  parser.add_argument(
      '--root_data_dir',
      default='/data',
      type=str,
      help='The directory which should contain the dataset required by the benchmark method.')
  parser.add_argument(
      '--gce_nvme_raid',
      default=None,
      type=str,
      help='If set to non-empty string, create raid 0 array with devices at the directory specified by the flag --root_data_dir')
  parser.add_argument(
      '--tensorflow_pip_spec',
      default=None,
      type=str,
      help='''The tensorflow pip package specification. The format can be either ${package_name}, or ${package_name}==${package_version}.
      Example values include tf-nightly-gpu, and tensorflow==1.12.0. If it is specified, the corresponding tensorflow pip package/version
      will be installed. Otherwise, the default tensorflow pip package specified in the docker file will be installed.
      ''')
  parser.add_argument(
      '--extra_pip_specs',
      default='',
      type=str,
      help='''Additional specifications to pass to `pip install`. (e.g. pinning certain dependencies)
      Specifications should be semicolon separated: e.g. `numpy==1.16.4;scipy==1.3.1`
      ''')
  parser.add_argument(
      '--docker_tag',
      default='perfzero/tensorflow',
      type=str,
      help='The docker tag to use if building a docker image.')
  parser.add_argument(
      '--site_package_downloads',
      default='',
      type=str,
      help='''Comma separated list of dirs in the external vm to copy to the docker\'s site package dir.
      Format: <absolute-path>/src/dir:new_base_dir_name,<absolute-path>/src/dir2>:new_name,....
      This will copy <absolute-path>/src/dir to <site-packages>/new_base_dir_name.
      ''')
  parser.add_argument(
      '--extra_docker_build_args',
      nargs='*',
      default='',
      type=str,
      help='''Additional build-args to pass to `docker build`.
      Example: --extra_docker_build_args arg0 arg1=value1 "arg2=value with space" arg3=300.
      Each string will be passed directly as a build-arg to docker, so the above example will be passed as follows:
      --build-arg arg0 --build-arg arg1=value1 --build-arg "arg2=value with space" --build-arg arg3=300
      ''')


def add_benchmark_parser_arguments(parser):
  """Add arguments to the parser used by the benchmark.py."""
  parser.add_argument(
      '--use_cached_site_packages',
      action='store_true',
      help='If set, skip git pull for dependent git repositories if it already exists in path_to_perfzero/${workspace}/site-packages')
  parser.add_argument(
      '--gcs_downloads',
      default=None,
      type=str,
      help='This flag is deprecated. Use the flag --data_downloads instead')
  parser.add_argument(
      '--git_repos',
      default=None,
      type=str,
      help='''A string representing git repositories to checkout. The format is url_1;branch_1;hash_1,url_2;branch_2;hash_2,...
      Git repositories will be checked-out under directory path_to_perfzero/${workspace}/site-packages,
      where ${workspace} either defaults to 'workspace', or takes the value of the flag --workspace.
      branch and hash can be skipped if user wants to use the head of the master branch,
      which shortens the format to url_1,url_2,...
      ''')
  parser.add_argument(
      '--benchmark_num_trials',
      default=1,
      type=int,
      help='''Configures number of times to run each benchmark method
      after setup completion.''')
  parser.add_argument(
      '--benchmark_methods',
      action='append',
      default=[],
      type=str,
      help='''This string specifies the benchmark_method to be executed. The flag can be specified multiple times in which case
      the union of methods matched by these flags will be executed. The format can be module_path.class_name.method_name in which
      case the corresponding method is executed. The format can also be module_path.class_name.filter:regex_pattern, in which case all methods
      of the given class whose method name matches the given regular expression are executed.
      ''')
  parser.add_argument(
      '--ml_framework_build_label',
      default=None,
      type=str,
      help='A string that identifies the machine learning framework build, e.g. nightly-gpu-build')
  parser.add_argument(
      '--execution_label',
      default=None,
      type=str,
      help='A string that identifies the benchmark execution type, e.g. test, prod')
  parser.add_argument(
      '--platform_name',
      default=None,
      type=str,
      help='A string that identifies the computing platform, e.g. gcp, aws')
  parser.add_argument(
      '--system_name',
      default=None,
      type=str,
      help='A string that identifies the hardware system, e.g. n1-standard-64-8xV100')
  parser.add_argument(
      '--output_gcs_url',
      default=None,
      type=str,
      help='''If specified, log files generated by the benchmark execution will be uploaded to output_gcs_url/${execution_id},
      where ${execution_id} is a string that is generated by PerfZero which uniquely identifies the execution of one benchmark method
      ''')
  parser.add_argument(
      '--scratch_gcs_url',
      default=None,
      type=str,
      help='''If specified, intermediate files like model outputs will be stored in scratch_gcs_url/${execution_id}, where
      ${execution_id} is a string that is generated by PerfZero which uniquely identifies the execution of one benchmark method.
      If not specified, intermediate files will be stored in a local folder on the host.
      ''')
  parser.add_argument(
      '--bigquery_project_name',
      default=None,
      type=str,
      help='''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
      summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
      The value of each field can in turn be a json-formatted string. See README.md for example output.
      ''')
  parser.add_argument(
      '--bigquery_dataset_table_name',
      default=None,
      type=str,
      help='''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
      summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
      The value of each field can in turn be a json-formatted string. See README.md for example output.
      ''')
  parser.add_argument(
      '--python_path',
      default=None,
      type=str,
      help='''A string of format path_1,path_2,... For each ${path} specified in the string,
      path_to_perfzero/${workspace}/site-packages/${path} will be added to python path so that libraries downloaded by --git_repos can
      be loaded and executed.
      ''')
  parser.add_argument(
      '--workspace',
      default='workspace',
      type=str,
      help='''The log files, gcloud key file and git repositories will be generated and downloaded under the
      directory path_to_perfzero/${workspace}
      ''')
  parser.add_argument(
      '--root_data_dir',
      default='/data',
      type=str,
      help='The directory which should contain the dataset required by the benchmark method.')
  parser.add_argument(
      '--data_downloads',
      default=None,
      type=str,
      help='''A string of format url_1;relative_path_1,url_2;relative_path_2,...
      Data will be copied from ${url} to ${root_data_dir}/${relative_path}. ${relative_path} can be skipped if it is the same as the
      base name of the url, which shortens the format to url_1,url_2,... ${root_data_dir} is specified by the flag --root_data_dir.
      File will be de-compressed in ${root_data_dir} if its name ends with 'gz'. Only url prefixed with gcs, http or https are supported.
      Each url can start with file://, gcs://, http:// or https://.
      ''')
  parser.add_argument(
      '--gcloud_key_file_url',
      default='gs://tf-performance/auth_tokens/benchmark_upload_gce.json',
      type=str,
      help='''The gcloud key file url. When specified, it will be downloaded to the
      directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
      The key file will then be activated and used as gcloud authentication credential.
      ''')
  parser.add_argument(
      '--debug',
      action='store_true',
      help='If set, use debug level logging. Otherwise, use info level logging')
  parser.add_argument(
      '--profiler_enabled_time',
      default=None,
      type=str,
      help='''A string of format begin_time_1:end_time_1,begin_time_2:end_time_2,.... PerfZero will start to collect profiler
      data ${begin_time} sec after benchmark method execution starts. The data collection continues for ${end_time - begin_time}
      sec or until the benchmark method execution finishes, whichever occurs first. If ${end_time} is not explicitly
      specified, it is assumed to be MAX_LONG.
      ''')
  parser.add_argument(
      '--execution_id',
      default=None,
      type=str,
      help='A string that uniquely identifies the benchmark execution.')
  parser.add_argument(
      '--result_upload_methods',
      default=None,
      type=str,
      help='A comma separated list of class.method values to upload results.')
  parser.add_argument(
      '--tpu_parameters',
      default=None,
      type=str,
      help='''A json dictionary of cloud tpu parameters. The format must look like the following:
      {"name": "my-tpu-name", "project": "my-gcp-project-id", "zone": "europe-west4-a", "size": "v3-8", "version": "nightly-2.x"}
      It can have an optional key value pair "version_id" -> "nightly version" to change the tpu version id.
      Example "version_id": "2.4.0-dev20200728".
      ''')
  parser.add_argument(
      '--perfzero_constructor_args',
      nargs='*',
      default='',
      type=str,
      help='''A json dictionary of additional args to pass to the perfzero
      constructor.''')
  parser.add_argument(
      '--benchmark_class_type',
      default=None,
      type=str,
      help='''The benchmark class type. If none, assumed perfzero_benchmark. Set to "tf_benchmark"
      for tf.test.Benchmark benchmarks.''')


class PerfZeroConfig(object):
  """Creates and contains config for PerfZero."""

  def __init__(self, mode, flags=None):
    self.mode = mode
    self.flags = flags
    if mode == 'flags':
      self.gcs_downloads_str = flags.gcs_downloads
      self.data_downloads_str = flags.data_downloads
      self.git_repos_str = flags.git_repos
      self.benchmark_method_patterns = []
      for value in flags.benchmark_methods:
        self.benchmark_method_patterns.extend(value.split(','))
      self.ml_framework_build_label = flags.ml_framework_build_label
      self.execution_label = flags.execution_label
      self.platform_name = flags.platform_name
      self.system_name = flags.system_name
      self.output_gcs_url = flags.output_gcs_url
      self.scratch_gcs_url = flags.scratch_gcs_url
      self.bigquery_project_name = flags.bigquery_project_name
      self.bigquery_dataset_table_name = flags.bigquery_dataset_table_name
      self.python_path_str = flags.python_path
      self.workspace = flags.workspace
      self.benchmark_class_type = flags.benchmark_class_type
      self.use_cached_site_packages = flags.use_cached_site_packages
      self.root_data_dir = flags.root_data_dir
      self.gcloud_key_file_url = flags.gcloud_key_file_url
      self.profiler_enabled_time_str = flags.profiler_enabled_time
      self.execution_id = flags.execution_id
      self.result_upload_methods = flags.result_upload_methods
      self.perfzero_constructor_args = flags.perfzero_constructor_args
      self.benchmark_num_trials = flags.benchmark_num_trials
      if flags.tpu_parameters:
        self.tpu_parameters = json.loads(flags.tpu_parameters)
      else:
        self.tpu_parameters = None

      if not flags.benchmark_methods:
        logging.warning('No benchmark method is specified by '
                        '--benchmark_methods')

      if flags.bigquery_project_name and not flags.bigquery_dataset_table_name:
        raise ValueError('--bigquery_project_name is specified but '
                         '--bigquery_dataset_table_name is not')

      if not flags.bigquery_project_name and flags.bigquery_dataset_table_name:
        raise ValueError('--bigquery_dataset_table_name is specified but '
                         '--bigquery_project_name is not')

  def get_env_vars(self):
    env_vars = {}
    for key in os.environ.keys():
      if key.startswith('PERFZERO_'):
        env_vars[key] = os.environ[key]
    return env_vars

  def get_flags(self):
    not_none_flags = {}
    for key in vars(self.flags):
      value = getattr(self.flags, key)
      if value is not None:
        not_none_flags[key] = value
    return not_none_flags

  def get_git_repos(self, site_packages_dir):
    """Parse git repository string."""
    git_repos = []
    if not self.git_repos_str:
      return git_repos

    for repo_entry in self.git_repos_str.split(','):
      parts = repo_entry.split(';')
      git_repo = {}
      git_repo['url'] = parts[0]
      # Assume the git url has format */{dir_name}.git
      git_repo['dir_name'] = parts[0].rsplit('/', 1)[-1].rsplit('.', 1)[0]
      git_repo['local_path'] = os.path.join(site_packages_dir,
                                            git_repo['dir_name'])
      if len(parts) >= 2:
        git_repo['branch'] = parts[1]
      if len(parts) >= 3:
        git_repo['git_hash'] = parts[2]
      git_repos.append(git_repo)

    return git_repos
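
To make the flag formats documented above concrete, a small sketch (not part of this repository) that builds a parser with add_benchmark_parser_arguments and inspects the resulting config; the benchmark method path and repository branch are illustrative values:

import argparse
import perfzero.perfzero_config as perfzero_config

parser = argparse.ArgumentParser()
perfzero_config.add_benchmark_parser_arguments(parser)
flags = parser.parse_args([
    '--git_repos=https://github.com/tensorflow/models.git;master',
    '--benchmark_methods=official.benchmark.SomeBenchmark.filter:benchmark_.*',
])

config = perfzero_config.PerfZeroConfig(mode='flags', flags=flags)
print(config.benchmark_method_patterns)
# ['official.benchmark.SomeBenchmark.filter:benchmark_.*']
print(config.get_git_repos('/workspace/site-packages'))
# [{'url': 'https://github.com/tensorflow/models.git', 'dir_name': 'models',
#   'local_path': '/workspace/site-packages/models', 'branch': 'master'}]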
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config_test.py
deleted 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for perfzero_config.py."""
from __future__ import print_function

import os
import unittest
import perfzero.perfzero_config as perfzero_config


class TestPerfZeroConfig(unittest.TestCase):

  def test_get_git_repos(self):
    config = perfzero_config.PerfZeroConfig(mode='mock')
    config.git_repos_str = 'https://github.com/tensorflow/benchmarks.git;branch_1;hash_1,https://github.com/tensorflow/models.git;branch_2'
    git_repos = config.get_git_repos('/site_package_dir')

    git_repo_1 = {}
    git_repo_1['url'] = 'https://github.com/tensorflow/benchmarks.git'
    git_repo_1['dir_name'] = 'benchmarks'
    git_repo_1['local_path'] = '/site_package_dir/benchmarks'
    git_repo_1['branch'] = 'branch_1'
    git_repo_1['git_hash'] = 'hash_1'

    git_repo_2 = {}
    git_repo_2['url'] = 'https://github.com/tensorflow/models.git'
    git_repo_2['dir_name'] = 'models'
    git_repo_2['local_path'] = '/site_package_dir/models'
    git_repo_2['branch'] = 'branch_2'

    self.assertEqual(2, len(git_repos))
    self.assertEqual(git_repo_1, git_repos[0])
    self.assertEqual(git_repo_2, git_repos[1])

  def test_get_env_vars(self):
    config = perfzero_config.PerfZeroConfig(mode='mock')
    self.assertEqual({}, config.get_env_vars())
    os.environ['PERFZERO_VAR1'] = 'value1'
    self.assertEqual({'PERFZERO_VAR1': 'value1'}, config.get_env_vars())
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/process_info_tracker.py
deleted 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keep track of process information such as maximum memory usage with a separate thread."""
from __future__ import absolute_import

import json
import logging
import os
import sched
import threading
import time
import traceback
import psutil


class ProcessInfoTracker(object):
  """Keep track of process information such as maximum memory usage with separate thread."""

  def __init__(self, output_dir):
    self.process_info_log = open(
        os.path.join(output_dir, 'process_info.log'), 'w')
    self.scheduler = sched.scheduler(time.time, time.sleep)
    self.process_info = {}
    self.process_info['max_rss'] = 0
    self.process_info['max_vms'] = 0
    self.process_info['max_cpu_percent'] = 0
    self.exit_event = threading.Event()
    self.last_exception = None
    self.start_time = None

  def start(self):
    self.start_time = time.time()
    # 4th positional arg added to support Python2 for the short-term.
    self.scheduler.enter(1, 1, self._update_process_info, ())  # pylint: disable=no-value-for-parameter
    threading.Thread(target=self.scheduler.run).start()
    logging.info('Started process information tracker.')

  def stop(self):
    self.exit_event.set()
    self.process_info_log.flush()
    logging.info('Stopped process information tracker.')

    if self.last_exception is not None:
      raise self.last_exception  # pylint: disable=raising-bad-type

    return dict(self.process_info)

  def _update_process_info(self):
    """Read and update process info using background thread every 1 second."""
    try:
      p = psutil.Process(os.getpid())
      memory_info = p.memory_info()
      # This is a blocking call which takes 0.1 second. This affects the
      # interval at which the metrics are reported.
      cpu_percent = p.cpu_percent(interval=0.1)
      self.process_info['max_rss'] = max(self.process_info['max_rss'],
                                         memory_info.rss)
      self.process_info['max_vms'] = max(self.process_info['max_vms'],
                                         memory_info.vms)
      self.process_info['max_cpu_percent'] = max(
          self.process_info['max_cpu_percent'], cpu_percent)

      entry = {}
      entry['time'] = time.time() - self.start_time
      entry['rss'] = memory_info.rss
      entry['vms'] = memory_info.vms
      entry['cpu_percent'] = cpu_percent
      self.process_info_log.write(json.dumps(entry) + '\n')

      if not self.exit_event.is_set():
        # Schedule the next event to be run after 1 second
        # 4th positional arg added to support Python2 for the short-term.
        self.scheduler.enter(1, 1, self._update_process_info, ())  # pylint: disable=no-value-for-parameter
    except Exception as e:  # pylint: disable=broad-except
      logging.error('Process info tracker failed due to error:\n%s',
                    traceback.format_exc())
      self.last_exception = e
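
A minimal usage sketch for the tracker above, assuming psutil is installed and the output directory already exists; the path is a placeholder:

import time
from perfzero.process_info_tracker import ProcessInfoTracker

tracker = ProcessInfoTracker('/tmp/perfzero_output')  # writes process_info.log there
tracker.start()
time.sleep(3)           # stand-in for the benchmark method under measurement
peaks = tracker.stop()  # {'max_rss': ..., 'max_vms': ..., 'max_cpu_percent': ...}
print(peaks)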
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/report_utils.py
deleted 100644 → 0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Upload test results."""
from __future__ import print_function

import importlib
import json
import logging
import perfzero.utils as utils
import psutil
import socket
from six import u as unicode  # pylint: disable=W0622


def execute_methods(method_names_str, *args, **kwargs):
  """Calls a list of method names on given function params.

  Args:
    method_names_str: String - Comma-separated module.foo.bar.method strings.
      This function imports module.foo.bar for each such method and calls it
      with *args and **kwargs.
    *args: Function params common to each method.
    **kwargs: Function params common to each method.

  Raises:
    RuntimeError: If any of the invoked methods raised an exception.
  """
  if not method_names_str:
    return

  errors = []
  module_methods_list = method_names_str.split(',')
  for module_method in module_methods_list:
    try:
      logging.info('Trying to call %s', module_method)
      module_path, method_path = module_method.rsplit('.', 1)
      this_module = importlib.import_module(module_path)
      logging.info('Found module %s, looking for %s', module_path, method_path)
      this_method = getattr(this_module, method_path)
      logging.info('Found method %s', method_path)
      this_method(*args, **kwargs)
    except Exception as e:  # pylint: disable=broad-except
      errors.append(str(e))

  if errors:
    raise RuntimeError('\n' + '\n'.join(errors))


def upload_execution_summary(bigquery_project_name,
                             bigquery_dataset_table_name,
                             execution_summary):
  """Upload benchmark summary.

  Note: Using stream=False has a 1000 per day insert limit per table. Using
  stream=True, the documented limit is 50K+. With streaming there can be a
  small and possibly not noticeable delay in seeing the results in the
  BigQuery UI, but there can be a delay of roughly 90 minutes before the
  results are part of exports.

  Note: BigQuery maps unicode() to STRING for python2. If str is used that is
  mapped to BYTE.

  Args:
    bigquery_project_name: Name of the gcp project.
    bigquery_dataset_table_name: data_set and table name.
    execution_summary: benchmark summary dictionary of results.
  """
  # pylint: disable=C6204
  import google.auth
  from google.cloud import bigquery

  if not bigquery_project_name:
    logging.info('Skipped uploading benchmark result to bigquery because bigquery project name is not set.')
    return
  if not bigquery_dataset_table_name:
    logging.info('Skipped uploading benchmark result to bigquery because bigquery table name is not set.')
    return

  credentials = google.auth.default()[0]
  dataset_name = bigquery_dataset_table_name.split('.')[0]
  table_name = bigquery_dataset_table_name.split('.')[1]
  client = bigquery.Client(project=bigquery_project_name,
                           credentials=credentials)

  benchmark_summary_input = {}
  for key, value in execution_summary.items():
    if isinstance(value, dict):
      benchmark_summary_input[key] = unicode(json.dumps(value))
    else:
      benchmark_summary_input[key] = unicode(value)

  logging.debug('Bigquery input for benchmark_summary table is %s',
                json.dumps(benchmark_summary_input, indent=2))

  errors = []
  # TODO(tobyboyd): Shim to direct results to new table until all jobs
  # are updated.
  if 'benchmark_results' in dataset_name:
    if dataset_name == 'benchmark_results_dev':
      table_ref = client.dataset('perfzero_dev').table('benchmark_summary')
      table_obj = client.get_table(table_ref)
    elif dataset_name == 'benchmark_results':
      table_ref = client.dataset('perfzero').table('benchmark_summary')
      table_obj = client.get_table(table_ref)
  else:
    table_ref = client.dataset(dataset_name).table(table_name)
    table_obj = client.get_table(table_ref)

  errors.extend(client.insert_rows(table_obj, [benchmark_summary_input]))

  if errors:
    logging.error('Failed to upload benchmark result to bigquery due to errors %s',
                  errors)
  else:
    logging.info('Uploaded benchmark result to the table %s of the bigquery project %s.',
                 bigquery_dataset_table_name, bigquery_project_name)


def build_benchmark_result(raw_benchmark_result, has_exception, trial_id):
  """Converts test_log.proto format to PerfZero format."""
  benchmark_result = {}
  benchmark_result['name'] = raw_benchmark_result['name']
  benchmark_result['wall_time'] = raw_benchmark_result['wall_time']

  succeeded = not has_exception
  extras = []
  for name in raw_benchmark_result.get('extras', {}):
    entry = {}
    entry['name'] = name
    if 'double_value' in raw_benchmark_result['extras'][name]:
      entry['value'] = raw_benchmark_result['extras'][name]['double_value']
    else:
      entry['value'] = raw_benchmark_result['extras'][name]['string_value']
    extras.append(entry)

  metrics = []
  for metric in raw_benchmark_result.get('metrics', []):
    value = metric['value']
    if 'min_value' in metric and metric['min_value'] > value:
      succeeded = False
    if 'max_value' in metric and metric['max_value'] < value:
      succeeded = False
    metrics.append(metric)

  benchmark_result['succeeded'] = succeeded
  benchmark_result['extras'] = extras
  benchmark_result['metrics'] = metrics
  benchmark_result['trial_id'] = trial_id

  return benchmark_result


def build_execution_summary(execution_timestamp, execution_id,
                            ml_framework_build_label, execution_label,
                            platform_name, system_name, output_gcs_url,
                            benchmark_result, env_vars, flags, harness_info,
                            site_package_info, process_info, has_exception,
                            is_tpu_benchmark):
  """Builds summary of the execution."""
  # Avoids module not found during setup phase when tf is not installed yet.
  # pylint: disable=C6204
  import tensorflow as tf

  benchmark_info = {}
  benchmark_info['harness_name'] = 'perfzero'
  benchmark_info['harness_info'] = harness_info
  benchmark_info['has_exception'] = has_exception
  if execution_label:
    benchmark_info['execution_label'] = execution_label
  if output_gcs_url:
    benchmark_info['output_url'] = '{}/{}/'.format(output_gcs_url,
                                                   execution_id)
  if env_vars:
    benchmark_info['env_vars'] = env_vars
  if flags:
    benchmark_info['flags'] = flags
  benchmark_info['site_package_info'] = site_package_info

  ml_framework_info = {}
  ml_framework_info['name'] = 'tensorflow'
  ml_framework_info['version'] = tf.__version__
  # tf.__git_version__ in Python3 has format b'version_string'
  if tf.__git_version__[0] == 'b':
    ml_framework_info['build_version'] = tf.__git_version__[2:-1]
  else:
    ml_framework_info['build_version'] = tf.__git_version__
  if ml_framework_build_label:
    ml_framework_info['build_label'] = ml_framework_build_label

  system_info = {}
  if platform_name:
    system_info['platform_name'] = platform_name
  if system_name:
    system_info['system_name'] = system_name
  if not is_tpu_benchmark:
    gpu_info = utils.get_gpu_info()
    if gpu_info:
      system_info['accelerator_driver_version'] = gpu_info['gpu_driver_version']
      system_info['accelerator_model'] = gpu_info['gpu_model']
      system_info['accelerator_count'] = gpu_info['gpu_count']
  system_info['cpu_model'] = utils.get_cpu_name()
  system_info['physical_cpu_count'] = psutil.cpu_count(logical=False)
  system_info['logical_cpu_count'] = psutil.cpu_count(logical=True)
  system_info['cpu_socket_count'] = utils.get_cpu_socket_count()
  system_info['hostname'] = socket.gethostname()

  execution_summary = {}
  execution_summary['execution_id'] = execution_id
  execution_summary['execution_timestamp'] = execution_timestamp
  execution_summary['benchmark_result'] = benchmark_result
  execution_summary['benchmark_info'] = benchmark_info
  execution_summary['setup_info'] = {}
  execution_summary['ml_framework_info'] = ml_framework_info
  execution_summary['system_info'] = system_info
  if process_info:
    execution_summary['process_info'] = process_info

  return execution_summary
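
To show the conversion performed by build_benchmark_result above, a small sketch with made-up numbers; the extras layout mirrors what tf.test.Benchmark writes into its test_log proto, and the benchmark name is illustrative:

import perfzero.report_utils as report_utils

raw = {
    'name': 'ResnetBenchmark.benchmark_1_gpu',   # illustrative method name
    'wall_time': 1234.5,
    'extras': {'exp_per_second': {'double_value': 312.0}},
}
result = report_utils.build_benchmark_result(raw, has_exception=False,
                                              trial_id=0)
# result == {'name': 'ResnetBenchmark.benchmark_1_gpu', 'wall_time': 1234.5,
#            'succeeded': True,
#            'extras': [{'name': 'exp_per_second', 'value': 312.0}],
#            'metrics': [], 'trial_id': 0}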