Commit a32ffa95 authored by qianyj

update TensorFlow2x test method

parent e286da17
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu
# - Installs requirements.txt for tensorflow/models
#
# This docker is not needed and is the same as the tf_v2 docker. The
# user can pass in the desired `ARG tensorflow_pip_spec`. Remove
# once TF 1.0 testing is done or KOKORO jobs are updated to use the
# tensorflow_pip_spec rather than the docker path to control the TF version.
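# A hypothetical build invocation overriding the pip spec (these build args
# are normally supplied by setup.py; the spec value here is only an example):
#   docker build --build-arg tensorflow_pip_spec="tensorflow-gpu==1.15.0" .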
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG extra_pip_specs=""
ARG local_tensorflow_pip_spec=""
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev,
# which itself may not really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.0.64-1+cuda10.0 \
libcudnn7-dev=7.6.0.64-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. The upgrade must be run via pip3; afterwards plain pip can be
# used, otherwise an error about no main being found is thrown.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu (this is TF 2.0)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for
# the docker image. Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev,
# which itself may not really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.2.24-1+cuda10.0 \
libcudnn7-dev=7.6.2.24-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. The upgrade must be run via pip3; afterwards plain pip can be
# used, otherwise an error about no main being found is thrown.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 10.1 and the following:
# - Installs tf-nightly-gpu (this is TF 2.1)
# - Installs requirements.txt for tensorflow/models
# - TF 2.0 was tested with CUDA 10.0, but TF 2.1 needs to be tested with CUDA 10.1.
FROM nvidia/cuda:10.1-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for
# the docker image. Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev,
# which itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark VM to get the available dependency versions when updating the
# CUDA version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-1 \
cuda-cufft-10-1 \
cuda-curand-10-1 \
cuda-cusolver-10-1 \
cuda-cusparse-10-1 \
libcudnn7=7.6.4.38-1+cuda10.1 \
libcudnn7-dev=7.6.4.38-1+cuda10.1 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.1 \
libnvinfer-dev=5.1.5-1+cuda10.1 \
libnvinfer6=6.0.1-1+cuda10.1 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. The upgrade must be run via pip3; afterwards plain pip can be
# used, otherwise an error about no main being found is thrown.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu (this is TF 2.0)
# - Installs requirements.txt for tensorflow/models
#
# NOTE: Branched from Dockerfile_ubuntu_1804_tf_v2 with changes for
# TFX benchmarks.
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# Specifies the default package version to use if no corresponding commit_id
# override is specified.
# If "head", uses the GitHub HEAD version.
# If "release", uses the latest released version from PyPI, REGARDLESS of
# package-compatibility requirements (e.g. even if tfx requires
# tensorflow-model-analysis<0.22, if tensorflow-model-analysis==0.22.0 is
# the latest released version on PyPI, we will install that).
# Packages include: tfx, tensorflow-transform, tensorflow-model-analysis,
# tensorflow-data-validation, tensorflow-metadata, tfx-bsl
ARG default_package_version="head"
# Specifies the package version to use for the corresponding packages.
# If empty, uses the default specified by default_package_version.
# If "head", uses the GitHub HEAD version.
# If "release", uses the latest released version from PyPI, REGARDLESS of
# package-compatibility requirements.
# If "github_commit:<commit id>", uses the given commit ID from GitHub.
# If "github_tag:<tag>" uses the given tag from GitHub.
# If "pypi:<version string>", uses the given package version from PyPI.
ARG tfx_package_version=""
ARG tensorflow_transform_package_version=""
ARG tensorflow_model_analysis_package_version=""
ARG tensorflow_data_validation_package_version=""
ARG tensorflow_metadata_package_version=""
ARG tfx_bsl_package_version=""
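# A hypothetical build invocation pinning individual package versions (these
# build args are normally supplied by setup.py):
#   docker build \
#     --build-arg default_package_version=release \
#     --build-arg tfx_package_version="github:<commit id>" \
#     --build-arg tensorflow_model_analysis_package_version="pypi:0.21.4" .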
# setup.py passes the base path of the local .whl file if one is chosen for
# the docker image. Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev,
# which itself may not really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.2.24-1+cuda10.0 \
libcudnn7-dev=7.6.2.24-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. The upgrade must be run via pip3; afterwards plain pip can be
# used, otherwise an error about no main being found is thrown.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN if [ ! -z "${extra_pip_specs}" ]; then pip install --upgrade --force-reinstall ${extra_pip_specs}; fi
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
# Install yolk3k, for getting package versions from PyPI (so we can pull
# TFX from GitHub even when we need to install from the released version)
RUN pip install yolk3k
# Install protoc
RUN PROTOC_ZIP=protoc-3.7.1-linux-x86_64.zip; \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.7.1/$PROTOC_ZIP; \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc; \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*'; \
rm -f $PROTOC_ZIP;
# Install Bazel
RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
RUN echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
RUN apt-get update
RUN apt-get install -y bazel
# Create symlink to "python3" binary under the name "python" so Bazel doesn't complain about "python" not being found
RUN ln -s $(which python3) /usr/bin/python
SHELL ["/bin/bash", "-c"]
RUN \
function install_package { \
# e.g. "head" or "release" \
default_version="$1"; \
# e.g "tensorflow-model-analysis" \
package_name="$2"; \
# e.g "model-analysis" \
package_repo_name="$3"; \
# How this package should be installed if pulled from GitHub. \
# "none" for no extra installation steps required \
# "bdist_wheel" for python setup.py bdist_wheel \
package_install_type=$4; \
# e.g. "head" or "release" or "pypi:0.21.4" or "github:[commit hash]" \
package_version="$5"; \
\
# e.g. "tensorflow_model_analysis" \
package_name_underscores=${package_name//-/_}; \
if [ "$package_version" == "" ]; then \
package_version="$default_version"; \
fi; \
\
commit_id=""; \
pypi_version=""; \
if [ "$package_version" == "head" ]; then \
commit_id=$(git ls-remote https://github.com/tensorflow/${package_repo_name} refs/heads/master | cut -f1); \
echo ${package_name}: latest commit from GitHub: ${commit_id}; \
elif [ "$package_version" == "release" ]; then \
pypi_version=$(yolk -V $package_name | head -n 1 | cut -d' ' -f 2-); \
echo ${package_name}: latest version from PyPI: ${pypi_version}; \
elif [ "${package_version:0:5}" == "pypi:" ]; then \
pypi_version="${package_version:5}"; \
echo ${package_name}: using specified PyPI version: ${pypi_version}; \
elif [ "${package_version:0:7}" == "github:" ]; then \
commit_id="${package_version:7}"; \
echo ${package_name}: using specified GitHub commit: ${commit_id}; \
else \
echo Unknown package version for ${package_name}: ${package_version}; \
exit 1; \
fi; \
\
if [ "$commit_id" != "" ]; then \
if [ "$package_install_type" == "none" ]; then \
# Package doesn't need extra installation steps - install directly from GitHub. \
pip install -e git+https://github.com/tensorflow/${package_repo_name}.git@${commit_id}#egg=${package_name_underscores}; \
install_commands+=("pip install --force --no-deps -e git+https://github.com/tensorflow/${package_repo_name}.git@${commit_id}#egg=${package_name_underscores}"); \
echo Installed ${package_name} from GitHub commit ${commit_id}; \
elif [ "$package_install_type" == "bdist_wheel" ]; then \
# Package needs extra installation steps. Clone from GitHub, then build and install. \
git clone https://github.com/tensorflow/${package_repo_name}.git; \
pushd ${package_repo_name}; \
git checkout ${commit_id}; \
if [ "$package_name" == "tfx" ]; then \
echo Building TFX pip package from source; \
sed -i 's@packages=packages,@packages=packages, package_data={package_name: ["benchmarks/datasets/chicago_taxi/data/taxi_1M.tfrecords.gz"]},@' setup.py; \
package_build/initialize.sh; \
python package_build/ml-pipelines-sdk/setup.py bdist_wheel; \
python package_build/tfx/setup.py bdist_wheel; \
else \
echo Using python setup.py bdist_wheel to build package; \
python setup.py bdist_wheel; \
fi; \
pip install dist/*.whl; \
install_commands+=("pip install --force --no-deps ${PWD}/dist/*.whl"); \
popd; \
echo Built and installed ${package_name} from GitHub commit ${commit_id}; \
fi; \
# Write GIT_COMMIT_ID attribute to the installed package. \
package_path=$(python3 -c "import ${package_name_underscores}; print(list(${package_name_underscores}.__path__)[0])"); \
echo "GIT_COMMIT_ID='${commit_id}'" >> ${package_path}/__init__.py; \
install_commands+=("echo \"GIT_COMMIT_ID='${commit_id}'\" >> ${package_path}/__init__.py;"); \
elif [ "$pypi_version" != "" ]; then \
if [ "$package_name" == "tfx" ]; then \
# Special handling for TFX - we want to install from GitHub, and get \
# the data files as well (they are not included in the pip package). \
# Install from the corresponding tag in GitHub. \
echo Special handling for tfx: will install tfx from GitHub tag for version ${pypi_version}; \
git clone --depth 1 --branch v${pypi_version} https://github.com/tensorflow/tfx.git; \
pushd tfx; \
echo Building TFX pip package from source; \
sed -i 's@packages=packages,@packages=packages, package_data={package_name: ["benchmarks/datasets/chicago_taxi/data/taxi_1M.tfrecords.gz"]},@' setup.py; \
package_build/initialize.sh; \
python package_build/ml-pipelines-sdk/setup.py bdist_wheel; \
python package_build/tfx/setup.py bdist_wheel; \
pip install dist/*.whl; \
install_commands+=("pip install --force --no-deps ${PWD}/dist/*.whl"); \
popd; \
echo Installed tfx from GitHub tag for version ${pypi_version}; \
else \
pip install ${package_name}==${pypi_version}; \
install_commands+=("pip install --force --no-deps ${package_name}==${pypi_version}"); \
echo Installed ${package_name} from PyPI version ${pypi_version}; \
fi; \
else \
echo Neither commit_id nor pypi_version was set for ${package_name}; \
exit 1; \
fi; \
}; \
\
# Array of commands to run post-installation. This is for forcing \
# installation of packages without regard to the requirements of other \
# packages. \
# The first round of installations installs the packages and their \
# requirements. This may result in some packages being re-installed at \
# versions other than the requested versions due to requirements from \
# other packages. \
# The second round of installations via install_commands \
# forces installations of the packages at the desired versions, ignoring \
# any dependencies of these packages or other packages. Note that if there \
# are incompatible package dependencies (e.g. tfx depends on \
# apache-beam==2.21 and tensorflow-transform depends on apache-beam==2.22), \
# then either could be installed depending on the installation order. \
install_commands=(); \
install_package "${default_package_version}" "tfx" "tfx" "bdist_wheel" "${tfx_package_version}"; \
install_package "${default_package_version}" "tensorflow-transform" "transform" "none" "${tensorflow_transform_package_version}"; \
install_package "${default_package_version}" "tensorflow-model-analysis" "model-analysis" "none" "${tensorflow_model_analysis_package_version}"; \
install_package "${default_package_version}" "tensorflow-data-validation" "data-validation" "bdist_wheel" "${tensorflow_data_validation_package_version}"; \
install_package "${default_package_version}" "tensorflow-metadata" "metadata" "bdist_wheel" "${tensorflow_metadata_package_version}"; \
install_package "${default_package_version}" "tfx-bsl" "tfx-bsl" "bdist_wheel" "${tfx_bsl_package_version}"; \
for cmd in "${install_commands[@]}"; do \
echo Running "${cmd}"; \
eval $cmd; \
done;
# Uninstall the TensorFlow version that TFX / the TFX components installed, and
# force install the version requested.
RUN pip uninstall -y tensorflow
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec}
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (this is TF 2.3)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for
# the docker image. Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev,
# which itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark VM to get the available dependency versions when updating the
# CUDA version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
--allow-downgrades --allow-change-held-packages \
build-essential \
cuda-tools-11-0 \
cuda-toolkit-11-0 \
libcudnn8=8.0.0.180-1+cuda11.0 \
libcudnn8-dev=8.0.0.180-1+cuda11.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. The upgrade must be run via pip3; afterwards plain pip can be
# used, otherwise an error about no main being found is thrown.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install tf-estimator-nightly
RUN pip install tensorflow-text-nightly
# RUN nvidia-smi
RUN nvcc --version
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (this is TF 2.3)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ENV PIP_CMD="python3.9 -m pip"
# setup.py passes the base path of the local .whl file if one is chosen for
# the docker image. Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev,
# which itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark VM to get the available dependency versions when updating the
# CUDA version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
# Needed to disable prompts during installation.
ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# Python 3.9 related deps are in this PPA.
RUN add-apt-repository ppa:deadsnakes/ppa
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3.9 \
python3-pip \
python3.9-dev \
python3-setuptools \
python3.9-venv \
python3.9-distutils \
python3.9-lib2to3
# Upgrade pip. The upgrade must be run via pip3; afterwards plain pip can be
# used, otherwise an error about no main being found is thrown.
RUN ${PIP_CMD} install --upgrade pip
RUN ${PIP_CMD} install --upgrade distlib
# setuptools upgraded to fix install requirements from model garden.
RUN ${PIP_CMD} install --upgrade setuptools
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.2/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
RUN ${PIP_CMD} install --upgrade pyyaml
RUN ${PIP_CMD} install --upgrade google-api-python-client==1.8.0
RUN ${PIP_CMD} install --upgrade google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN ${PIP_CMD} install wheel
RUN ${PIP_CMD} install absl-py
RUN ${PIP_CMD} install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN ${PIP_CMD} install tfds-nightly
RUN ${PIP_CMD} install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN ${PIP_CMD} install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN ${PIP_CMD} install -r /tmp/requirements.txt
RUN ${PIP_CMD} install tf-estimator-nightly
RUN ${PIP_CMD} install tensorflow-text-nightly
# RUN nvidia-smi
RUN nvcc --version
RUN ${PIP_CMD} freeze
#!/bin/bash
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
set -e
set -x
git clone https://github.com/tensorflow/benchmarks.git
cd benchmarks
tf_spec="tf-nightly-gpu==2.5.0.dev20210212"
CIFAR10_DATA="gs://tf-perf-imagenet-uswest1/tensorflow/cifar10_data/cifar-10-batches-bin"
CIFAR10_BENCHMARKS="official.benchmark.keras_cifar_benchmark.Resnet56KerasBenchmarkReal.benchmark_1_gpu_no_dist_strat"
RESNET50_DATA="gs://tf-perf-imagenet-uswest1/tensorflow/imagenet"
RESNET50_BENCHMARKS="official.benchmark.resnet_ctl_imagenet_benchmark.Resnet50CtlBenchmarkReal.benchmark_1_gpu_fp16"
function run_single_benchmark() {
docker_name=$1
label=$2
data_downloads=$3
benchmark_methods=$4
perfzero_pwd=`pwd`
nvidia-docker run \
-v ${perfzero_pwd}:/workspace \
-v /data:/data \
-e PERFZERO_RUN_TAGS= \
-e PERFZERO_TRACKING_ID= \
-e PERFZERO_COMMIT_LABEL= \
-e PERFZERO_EXECUTION_BRANCH=master \
-e CUDNN_LOGINFO_DBG=1 \
-e CUDNN_LOGDEST_DBG=stderr \
${docker_name} \
python3 /workspace/perfzero/lib/benchmark.py \
--bigquery_dataset_table_name="" \
--data_downloads="${data_downloads}" \
--ml_framework_build_label=v2-nightly-gpu \
--execution_label="${label}" \
--platform_name=kokoro-gcp \
--system_name=n1-standard-8-1xA100 \
--output_gcs_url="" \
--benchmark_class_type= \
--scratch_gcs_url= \
--root_data_dir=/data \
--benchmark_num_trials=2 \
--bigquery_project_name="" \
--git_repos="https://github.com/tensorflow/models.git;benchmark" \
--python_path=models \
--benchmark_methods=${benchmark_methods} \
--result_upload_methods="" \
--gcloud_key_file_url="${PERFZERO_GCLOUD_KEY_FILE_URL}" \
--tpu_parameters=
}
function run_benchmarks() {
docker_name=$1
label=$2
run_single_benchmark ${docker_name} ${label} "${CIFAR10_DATA}" "${CIFAR10_BENCHMARKS}"
run_single_benchmark ${docker_name} ${label} "${RESNET50_DATA}" "${RESNET50_BENCHMARKS}"
}
function setup_docker() {
label=$1
dockerfile=$2
echo "`date` Setting up ${label} docker..."
sudo python3 perfzero/lib/setup.py \
--gce_nvme_raid= \
--docker_tag="${label}" \
--gcloud_key_file_url= \
--tensorflow_pip_spec=${tf_spec} \
--dockerfile_path=${dockerfile}
echo "`date` Finished setting up ${label} docker."
}
function diff_benchmarks() {
python3 perfzero/dockertest/diff_benchmarks.py `pwd`
}
baseline_docker="docker/Dockerfile_ubuntu_cuda11_8_0_0_180"
experiment_docker="docker/Dockerfile_ubuntu_1804_tf_cuda_11"
setup_docker "control/tensorflow" ${baseline_docker}
run_benchmarks "control/tensorflow" "control-8-0-0-180"
setup_docker "experiment/tensorflow" ${experiment_docker}
run_benchmarks "experiment/tensorflow" "experiment-8-0-4-30"
diff_benchmarks
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""Simple script to diff benchmark results.
This script will read all the summary files from a base output directory and
print a human readable diff report.
"""
import json
import os
import sys
def _find_perfzero_logs(docker_output_dir):
"""Finds pairs of json_file, output_log file from all methods."""
summary_files = []
for root, _, files in os.walk(docker_output_dir):
for summary_file in files:
if summary_file.endswith('perfzero_summary.json'):
full_summary_file = os.path.join(root, summary_file)
summary_files.append(full_summary_file)
sys.stdout.write('Found json {}\n'.format(full_summary_file))
return summary_files
def _load_summaries(summary_files):
"""Loads input json file paths and returns json objects."""
summary_jsons = []
for summary_file in summary_files:
with open(summary_file, 'r') as f:
summary_json = json.load(f)
summary_jsons.append(summary_json)
return summary_jsons
def _summarize_benchmarks(summary_files):
"""Remaps list of json files -> summaries by benchmark method."""
summary_jsons = _load_summaries(summary_files)
performance_by_method = {}
for summary_json in summary_jsons:
method = summary_json['benchmark_result']['name']
trial = summary_json['benchmark_result']['trial_id']
metrics_list = summary_json['benchmark_result']['metrics']
metrics = {}
for metric_info in metrics_list:
metrics[metric_info['name']] = metric_info['value']
metrics['wall_time'] = summary_json['benchmark_result']['wall_time']
label = summary_json['benchmark_info']['execution_label']
performance_by_method.setdefault(method, {}).setdefault(label, [])
performance_by_method[method][label].append((trial, metrics))
return performance_by_method
def _print_diff_report(performance_by_method):
"""Prints a diff report of benchmark performance."""
print('Performance report:')
print(json.dumps(performance_by_method, indent=2))
method_to_metric_to_perf = {}
for method in performance_by_method:
for label, label_data in performance_by_method[method].items():
latest_trial_data = max(label_data, key=lambda x: x[0])
latest_metrics = latest_trial_data[1]
for metric, value in latest_metrics.items():
method_to_metric_to_perf.setdefault(method, {}).setdefault(metric, [])
method_to_metric_to_perf[method][metric].append((label, value))
print('Diff report:')
for method in sorted(method_to_metric_to_perf):
print('-- benchmark: {}'.format(method))
for metric in sorted(method_to_metric_to_perf[method].keys()):
value_list = []
for label, value in sorted(
method_to_metric_to_perf[method][metric], key=lambda x: x[0]):
print(' {}: {}: {}'.format(metric, label, value))
value_list.append(value)
if len(value_list) == 2:
control_val = float(value_list[0])
expt_val = float(value_list[1])
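# e.g. control 100.0 and experiment 105.0 give diff_pct = 5.00 (5% higher).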
if abs(control_val) > 1e-5:
diff_pct = (expt_val / control_val - 1.0) * 100.0
else:
diff_pct = -1.0
print(' diff: {:2.2f}%'.format(diff_pct))
def main():
if len(sys.argv) != 2:
raise RuntimeError('Usage: {} <base perfzero output dir>'.format(
sys.argv[0]))
perfzero_output_dir = sys.argv[1]
summary_files = _find_perfzero_logs(perfzero_output_dir)
performance_by_method = _summarize_benchmarks(summary_files)
_print_diff_report(performance_by_method)
if __name__ == '__main__':
main()
six
google-api-python-client>=1.6.7
kaggle>=1.3.9
numpy>=1.15.4
oauth2client
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy>=0.19.1
tensorflow-hub>=0.6.0
tensorflow-model-optimization>=0.4.1
tensorflow-datasets
tensorflow-addons
dataclasses;python_version<"3.7"
gin-config
tf_slim>=1.1.0
Cython
matplotlib
pyyaml>=5.1
# CV related dependencies
opencv-python-headless
Pillow
pycocotools
# NLP related dependencies
seqeval
sentencepiece
sacrebleu
#!/bin/bash
set -e
set -x
# To run this script from a GCP VM / host connected to GPUs:
# git clone https://github.com/tensorflow/benchmarks.git
# cd benchmarks
# bash perfzero/dockertest/resnet50_synth.sh
# Output log files/results will be stored at perfzero/workspace/output/
# Modify the INPUT_PARAMS variables below to tweak the TF whl under test, benchmark methods, and dataset paths.
# You can comment out the "build_docker" call at the end if the docker image is already built.
## INPUT PARAMS: start
# Acceptable formats for TF_PIP_SPEC
# PyPI nightlies: tf-nightly-gpu==2.6.0.dev20210521
# GCS path to a whl: gs://some-path-to-tf.whl
# Local path to a whl: file://some-local-path-to-tf.whl
TF_PIP_SPEC="tf-nightly-gpu==2.6.0.dev20210624"
# Path to GCS or local files containing the input datasets (if they need to be fetched into the docker).
DATA_DOWNLOADS=""
# Comma separated list of strings.
BENCHMARK_METHODS="official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.benchmark_1_gpu_fp16"
# If either the tf_pip_spec or data downloads reference private GCP, then we
# need to set GCLOUD_KEY_FILE_URL to a credentials file.
GCLOUD_KEY_FILE_URL=""
# Commit id in the tensorflow/models repository, branch='benchmark', which has the benchmarks.
MODELS_GIT_HASH="169e4051aef247c27a95748a8015b2f35f509e1a"
## INPUT PARAMS: end
build_docker() {
echo "building docker"
sudo python3 perfzero/lib/setup.py \
--dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_cuda_11 \
--tensorflow_pip_spec="${TF_PIP_SPEC}" \
--gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}" \
--extra_docker_build_args=
sudo docker images
}
run_benchmark() {
echo "running benchmark"
benchmark_tag=$1
env_var=$2
sudo nvidia-docker run \
-v ${PWD}:/workspace \
-v /data:/data \
-e PERFZERO_EXECUTION_MODE=test \
-e TF_ENABLE_LEGACY_FILESYSTEM=1 \
-e ${env_var} \
perfzero/tensorflow python3 \
/workspace/perfzero/lib/benchmark.py \
--root_data_dir=/data \
--bigquery_dataset_table_name="" \
--benchmark_class_type= \
--ml_framework_build_label=v2-nightly-gpu-${benchmark_tag} \
--execution_label=test-benchmark \
--platform_name=kokoro-gcp \
--system_name=n1-standard-8-1xV100 \
--output_gcs_url="" \
--benchmark_num_trials=1 \
--scratch_gcs_url= \
--bigquery_project_name="" \
--git_repos="https://github.com/tensorflow/models.git;benchmark;${MODELS_GIT_HASH}" \
--data_downloads="${DATA_DOWNLOADS}"\
--python_path=models \
--benchmark_methods="${BENCHMARK_METHODS}" \
--gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}"
}
build_docker
run_benchmark "control" "TF_CUDNN_USE_FRONTEND=false"
run_benchmark "experiment" "TF_CUDNN_USE_FRONTEND=true"
#!/bin/bash
set -e
set -x
# To run this script from a GCP VM / host connected to GPUs:
# git clone https://github.com/tensorflow/benchmarks.git
# cd benchmarks
# bash perfzero/dockertest/run_single_benchmark.sh
# Output log files/results will be stored at perfzero/workspace/output/
# Modify the INPUT_PARAMS variables below to tweak the TF whl under test, benchmark methods, and dataset paths.
# You can comment out the "build_docker" call at the end if the docker image is already built.
## INPUT PARAMS: start
# Acceptable formats for TF_PIP_SPEC
# PyPI nightlies: tf-nightly-gpu==2.6.0.dev20210521
# GCS path to a whl: gs://some-path-to-tf.whl
# Local path to a whl: file://some-local-path-to-tf.whl
TF_PIP_SPEC="tf-nightly-gpu==2.6.0.dev20210521"
# Path to GCS or local files containing the input datasets (if they need to be fetched into the docker).
DATA_DOWNLOADS=""
# Comma separated list of strings.
BENCHMARK_METHODS="official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.benchmark_xla_1_gpu_fp16"
# If either the tf_pip_spec or data downloads reference private GCP, then we
# need to set GCLOUD_KEY_FILE_URL to a credentials file.
GCLOUD_KEY_FILE_URL=""
## INPUT PARAMS: end
build_docker() {
echo "building docker"
sudo python3 perfzero/lib/setup.py \
--dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_cuda_11 \
--tensorflow_pip_spec="${TF_PIP_SPEC}" \
--gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}" \
--extra_docker_build_args=
sudo docker images
}
run_benchmark() {
echo "running benchmark"
sudo nvidia-docker run \
-v ${PWD}:/workspace \
-v /data:/data \
-e PERFZERO_EXECUTION_MODE=test \
-e TF_ENABLE_LEGACY_FILESYSTEM=1 \
perfzero/tensorflow python3 \
/workspace/perfzero/lib/benchmark.py \
--root_data_dir=/data \
--bigquery_dataset_table_name="" \
--benchmark_class_type= \
--ml_framework_build_label=v2-nightly-gpu \
--execution_label=test-benchmark \
--platform_name=kokoro-gcp \
--system_name=n1-standard-8-1xV100 \
--output_gcs_url="" \
--benchmark_num_trials=1 \
--scratch_gcs_url= \
--bigquery_project_name="" \
--git_repos='https://github.com/tensorflow/models.git;benchmark;f7938e6ad46fecfa1112eda579eb046eb3f3bf96' \
--data_downloads="${DATA_DOWNLOADS}"\
--python_path=models \
--benchmark_methods="${BENCHMARK_METHODS}" \
--gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}"
}
build_docker
run_benchmark
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute benchmark."""
from __future__ import print_function
import argparse
import json
import logging
import multiprocessing
import os
import re
import sys
import time
import perfzero.benchmark_method_runner as benchmark_method_runner
import perfzero.perfzero_config as perfzero_config
import perfzero.tpu_runtime_utils as tpu_runtime_utils
import perfzero.utils as utils
class BenchmarkRunner(object):
"""Execute benchmark and report results."""
def __init__(self, config):
self.config = config
self.project_dir = os.path.abspath(
os.path.dirname(os.path.dirname(__file__)))
self.workspace_dir = os.path.join(self.project_dir, config.workspace)
self.site_packages_dir = os.path.join(self.workspace_dir, 'site-packages')
self.root_output_dir = os.path.join(self.workspace_dir, 'output')
self.benchmark_execution_time = {}
def _setup(self):
"""Download data and checkout git repository."""
# Activate gcloud service
start_time = time.time()
utils.setup_python_path(self.site_packages_dir, self.config.python_path_str)
utils.active_gcloud_service(self.config.gcloud_key_file_url,
self.workspace_dir)
utils.make_dir_if_not_exist(self.root_output_dir)
self.benchmark_execution_time['activate_gcloud_service'] = (
time.time() - start_time)
# Download data
start_time = time.time()
utils.download_data(utils.parse_data_downloads_str(
self.config.root_data_dir, self.config.gcs_downloads_str))
utils.download_data(utils.parse_data_downloads_str(
self.config.root_data_dir, self.config.data_downloads_str))
self.benchmark_execution_time['download_data'] = time.time() - start_time
# Checkout git repositories
start_time = time.time()
site_package_info = utils.checkout_git_repos(
self.config.get_git_repos(self.site_packages_dir),
self.config.use_cached_site_packages)
self.benchmark_execution_time['checkout_repository'] = (
time.time() - start_time)
# Start cloud TPU.
if self.config.tpu_parameters is not None:
start_time = time.time()
utils.setup_tpu(self.config.tpu_parameters)
tpu_info = tpu_runtime_utils.configure_tpu(self.config.tpu_parameters)
site_package_info['tpu_version'] = tpu_info
self.benchmark_execution_time['start_tpu'] = time.time() - start_time
self.stream_handler = logging.StreamHandler(sys.stdout)
self.stream_handler.setFormatter(
logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
logging.getLogger().addHandler(self.stream_handler)
return site_package_info
def _get_benchmark_methods(self):
"""Returns list of benchmark methods to execute."""
filter_prefix = 'filter:'
benchmark_methods = []
for benchmark_method_pattern in self.config.benchmark_method_patterns:
if filter_prefix not in benchmark_method_pattern:
benchmark_methods.append(benchmark_method_pattern)
else:
index = benchmark_method_pattern.find(filter_prefix)
benchmark_class = benchmark_method_pattern[:index - 1]
pattern = benchmark_method_pattern[index + len(filter_prefix):]
class_instance = utils.instantiate_benchmark_class(
benchmark_class, '/dev/null', '', None, {},
benchmark_class_type=self.config.benchmark_class_type)
for benchmark_method_name in dir(class_instance):
if re.match(pattern, benchmark_method_name):
benchmark_methods.append(benchmark_class + '.' +
benchmark_method_name)
logging.info('The following benchmark methods will be executed: %s',
benchmark_methods)
return benchmark_methods
def _run_benchmarks_trial(self, harness_info, site_package_info,
benchmark_methods, trial_id):
"""Runs a single trial of all benchmark methods."""
# Run each benchmark method in a separate process so that its memory usage
# does not affect the execution of the other benchmark methods.
# This is a workaround until we fix all memory leak issues in TensorFlow.
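# Each child process reports results back through a multiprocessing.Queue as
# a tuple: (method_has_exception, method_execution_time, succeeded, output_dir).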
has_exception = False
benchmark_success_results = {}
benchmark_output_dirs = {}
benchmark_execution_time = {}
for benchmark_method in benchmark_methods:
queue = multiprocessing.Queue()
process = multiprocessing.Process(target=benchmark_method_runner.run,
args=(benchmark_method,
harness_info,
site_package_info,
self.root_output_dir,
self.config, queue, trial_id))
process.start()
process.join()
method_has_exception, method_execution_time, succeeded, output_dir = queue.get() # pylint: disable=line-too-long
has_exception |= method_has_exception
benchmark_execution_time[benchmark_method] = method_execution_time
benchmark_success_results[benchmark_method] = succeeded
benchmark_output_dirs[benchmark_method] = output_dir
return (has_exception, benchmark_success_results,
benchmark_output_dirs, benchmark_execution_time)
def run_benchmark(self):
"""Run benchmark."""
harness_info = utils.get_git_repo_info(self.project_dir)
has_exception = False
benchmark_success_results = {}
benchmark_output_dirs = {}
num_trials = self.config.benchmark_num_trials
try:
site_package_info = self._setup()
benchmark_methods = self._get_benchmark_methods()
print('Setup complete. Running {} trials'.format(num_trials))
for trial_id in range(1, num_trials + 1):
print('Running trial {} / {}'.format(trial_id, num_trials))
(trial_has_exception, trial_success_results,
trial_output_dirs, trial_execution_time) = self._run_benchmarks_trial(
harness_info, site_package_info, benchmark_methods, trial_id)
trial_key = 'trial_{}'.format(trial_id)
has_exception |= trial_has_exception
self.benchmark_execution_time[trial_key] = trial_execution_time
benchmark_success_results[trial_key] = trial_success_results
benchmark_output_dirs[trial_key] = trial_output_dirs
finally:
if self.config.tpu_parameters is not None:
has_exception |= utils.cleanup_tpu(self.config.tpu_parameters)
print('Benchmark execution time in seconds by operation:\n {}'.format(
json.dumps(self.benchmark_execution_time, indent=2)))
print('Benchmark success results:\n{}'.format(
json.dumps(benchmark_success_results, indent=2)))
print('Benchmark local output directories:\n{}'.format(
json.dumps(benchmark_output_dirs, indent=2)))
if has_exception:
sys.exit(1)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
perfzero_config.add_benchmark_parser_arguments(parser)
FLAGS, unparsed = parser.parse_known_args()
level = logging.DEBUG if FLAGS.debug else logging.INFO
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
level=level)
if unparsed:
logging.error('Arguments %s are not recognized', unparsed)
sys.exit(1)
config_ = perfzero_config.PerfZeroConfig(mode='flags', flags=FLAGS)
benchmark_runner = BenchmarkRunner(config_)
benchmark_runner.run_benchmark()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark.py."""
from __future__ import print_function
import sys
import unittest
import benchmark
import mock
class TestBenchmarkRunner(unittest.TestCase):
def test_get_benchmark_methods_filter(self):
"""Tests returning methods on a class based on a filter."""
config = mock.Mock()
config.workspace = 'workspace'
config.benchmark_method_patterns = ['new_foo.BenchmarkClass.filter:bench.*']
benchmark_runner = benchmark.BenchmarkRunner(config)
mock_benchmark_class = mock.Mock()
mock_benchmark_class.benchmark_method_1 = 'foo'
mock_module = mock.Mock()
sys.modules['new_foo'] = mock_module
mock_module.BenchmarkClass.return_value = mock_benchmark_class
methods = benchmark_runner._get_benchmark_methods()
self.assertEqual(1, len(methods))
self.assertEqual('new_foo.BenchmarkClass.benchmark_method_1', methods[0])
def test_get_benchmark_methods_exact_match(self):
"""Tests returning methods on a class based on a filter."""
config = mock.Mock()
config.workspace = 'workspace'
config.benchmark_method_patterns = [
'new_foo.BenchmarkClass.benchmark_method_1',
'new_foo.BenchmarkClass.benchmark_method_2']
benchmark_runner = benchmark.BenchmarkRunner(config)
methods = benchmark_runner._get_benchmark_methods()
self.assertEqual(['new_foo.BenchmarkClass.benchmark_method_1',
'new_foo.BenchmarkClass.benchmark_method_2'], methods)
#!/usr/bin/python
#
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper script to create, query and stop machine in GCP."""
from __future__ import print_function
import argparse
import getpass
import logging
import subprocess
import sys
import time
INSTANCE_NAME_PREFIX = 'perfzero-dev-'
def run_command(cmd, is_from_user=False):
"""Runs list of command and throw error if return code is non-zero.
Args:
cmd: Command to execute
is_from_user: If true, log the command and its output at INFO level.
Otherwise, log them at DEBUG level.
Returns:
a string representing the command output
Raises:
Exception: raised when the command execution has non-zero exit code
"""
_log = logging.info if is_from_user else logging.debug
_log('Executing command: {}'.format(cmd))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=True)
exit_code = None
line = ''
stdout = ''
while exit_code is None or line:
exit_code = p.poll()
line = p.stdout.readline().decode('utf-8')
stdout += line
_log(line)
if exit_code and is_from_user:
sys.exit(exit_code)
elif exit_code:
raise Exception('Command:\n{}\nfailed with output:\n{}'.format(cmd, stdout))
return stdout
def get_instance_name(username):
return INSTANCE_NAME_PREFIX + username
def get_machine_type(machine_type, accelerator_count):
"""Get machine type for the instance.
- Use the user-specified machine_type if it is not None
- Otherwise, use the standard type with cpu_count = 8 x accelerator_count
if user-specified accelerator_count > 0
- Otherwise, use the standard type with 8 CPUs
Args:
machine_type: machine_type specified by the user
accelerator_count: accelerator count
Returns:
the machine type used for the instance
"""
if machine_type:
return machine_type
cpu_count = max(accelerator_count, 1) * 8
return 'n1-standard-{}'.format(cpu_count)
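# Hypothetical example of the ssh prefix built below:
#   _ssh_prefix('my-project', 'us-west1-b', False, None)
#   -> 'gcloud compute ssh --project=my-project --zone=us-west1-b'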
def _ssh_prefix(project, zone, internal_ip, key_file):
if internal_ip:
ssh_prefix = 'gcloud beta compute ssh --internal-ip'
else:
ssh_prefix = 'gcloud compute ssh'
if key_file:
ssh_prefix = '{} --ssh-key-file={}'.format(ssh_prefix, key_file)
return '{} --project={} --zone={}'.format(ssh_prefix, project, zone)
def create(username, project, zone, machine_type, accelerator_count,
accelerator_type, image, nvme_count, ssh_internal_ip, ssh_key_file,
cpu_min_platform=None, boot_ssd_size=None):
"""Create gcloud computing instance.
Args:
username: the username of the current user
project: project name
zone: zone of the GCP computing instance
machine_type: the machine type used for the instance
accelerator_count: the number of accelerators to attach to the instance
accelerator_type: the specific type of accelerator to attach to the instance
image: the name of the image that the disk will be initialized with
nvme_count: the number of NVME local SSD devices to attach to the instance
ssh_internal_ip: internal ip to use for ssh.
ssh_key_file: ssh key file to use to connect to instance.
cpu_min_platform: minimum CPU platform to use, if None use default.
boot_ssd_size: If set boot disk is changed to SSD and this size(GB) is used.
"""
instance_name = get_instance_name(username)
machine_type = get_machine_type(machine_type, accelerator_count)
logging.debug('Creating gcloud computing instance %s', instance_name)
cmd = '''gcloud compute instances create {} \
--image={} \
--project={} \
--zone={} \
--machine-type={} \
--maintenance-policy=TERMINATE \
'''.format(instance_name, image, project, zone, machine_type)
if boot_ssd_size:
cmd += '--boot-disk-size={}GB --boot-disk-type=pd-ssd '.format(
boot_ssd_size)
if accelerator_count > 0:
cmd += '--accelerator=count={},type={} '.format(
accelerator_count, accelerator_type)
if cpu_min_platform:
cmd += '--min-cpu-platform="{}" '.format(cpu_min_platform)
for _ in range(nvme_count):
cmd += '--local-ssd=interface=NVME '
run_command(cmd, is_from_user=True)
logging.info('Successfully created gcloud computing instance %s '
'with %s accelerator.\n', instance_name, accelerator_count)
ssh_prefix = _ssh_prefix(project, zone, ssh_internal_ip, ssh_key_file)
# Wait until we can ssh to the newly created computing instance
cmd = '{} --strict-host-key-checking=no --command="exit" {}'.format(
ssh_prefix, instance_name)
ssh_remaining_retries = 12
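# 12 retries with a 5-second sleep cover the 60-second window mentioned in
# the error message below.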
ssh_error = None
while ssh_remaining_retries > 0:
ssh_remaining_retries -= 1
try:
run_command(cmd, is_from_user=False)
ssh_error = None
except Exception as error: # pylint: disable=broad-except
ssh_error = error
if ssh_remaining_retries:
logging.info('Cannot ssh to the computing instance. '
'Try again after 5 seconds')
time.sleep(5)
else:
logging.error('Cannot ssh to the computing instance after '
'60 seconds due to error:\n%s', str(ssh_error))
if ssh_error:
logging.info('Run the commands below manually after ssh into the computing '
'instance:\n'
'git clone https://github.com/tensorflow/benchmarks.git\n'
'sudo usermod -a -G docker $USER\n')
else:
cmd = '{} --command="git clone {}" {}'.format(
ssh_prefix, 'https://github.com/tensorflow/benchmarks.git',
instance_name)
run_command(cmd, is_from_user=True)
logging.info('Successfully checked-out PerfZero code on the '
'computing instance\n')
cmd = '{} --command="sudo usermod -a -G docker $USER" {}'.format(
ssh_prefix, instance_name)
run_command(cmd, is_from_user=True)
logging.info('Successfully added user to the docker group\n')
cmd = '{} {} -- -L 6006:127.0.0.1:6006'.format(ssh_prefix, instance_name)
logging.info('Run the command below to ssh to the instance together with '
'port forwarding for tensorboard:\n%s\n', cmd)
def status(username, project, zone, ssh_internal_ip, ssh_key_file):
"""Query the status of the computing instance.
Args:
username: the username of the current user.
project: project name.
zone: zone of the GCP computing instance.
ssh_internal_ip: internal ip of the instance.
ssh_key_file: SSH key file to use to connect to the instance.
"""
instance_name = get_instance_name(username)
logging.debug('Querying status of gcloud computing instance %s of '
'project %s in zone %s', instance_name, project, zone)
cmd = 'gcloud compute instances list --filter="name={} AND zone:{}" --project {}'.format( # pylint: disable=line-too-long
instance_name, zone, project)
stdout = run_command(cmd, is_from_user=True)
num_instances = len(stdout.splitlines()) - 1
logging.info('\nFound %s gcloud computing instance with name %s.\n',
num_instances, instance_name)
if num_instances == 1:
cmd = '{} {} -- -L 6006:127.0.0.1:6006'.format(
_ssh_prefix(project, zone, ssh_internal_ip, ssh_key_file),
instance_name)
logging.info('Run the command below to ssh to the instance together with '
'port forwarding for tensorboard:\n%s\n', cmd)
def list_all(project):
logging.debug('Finding all gcloud computing instance of project %s created '
'for PerfZero test', project)
cmd = 'gcloud compute instances list --filter="name ~ {}" --project={}'.format( # pylint: disable=line-too-long
INSTANCE_NAME_PREFIX, project)
stdout = run_command(cmd, is_from_user=True)
num_instances = len(stdout.splitlines()) - 1
logging.info('\nFound %s gcloud computing instance of project %s created '
'for PerfZero test', num_instances, project)
def start(username, project, zone):
instance_name = get_instance_name(username)
logging.debug('Starting gcloud computing instance %s of project %s '
'in zone %s', instance_name, project, zone)
cmd = 'gcloud compute instances start {} --project={} --zone={}'.format(
instance_name, project, zone)
run_command(cmd, is_from_user=True)
logging.debug('\nSuccessfully started gcloud computing instance %s of '
'project %s in zone %s', instance_name, project, zone)
def stop(username, project, zone):
"""Stop the computing instance unique to the given username."""
instance_name = get_instance_name(username)
logging.debug('Stopping gcloud computing instance %s of project %s in '
'zone %s', instance_name, project, zone)
cmd = 'gcloud compute instances stop {} --project={} --zone={}'.format(
instance_name, project, zone)
run_command(cmd, is_from_user=True)
logging.debug('\nSuccessfully stopped gcloud computing instance %s of '
'project %s in zone %s', instance_name, project, zone)
def delete(username, project, zone):
"""Delete the computing instance unique to the given username."""
instance_name = get_instance_name(username)
logging.debug('Deleting gcloud computing instance %s of project %s in '
'zone %s', instance_name, project, zone)
cmd = 'echo Y | gcloud compute instances delete {} --project={} --zone={}'.format( # pylint: disable=line-too-long
instance_name, project, zone)
run_command(cmd, is_from_user=True)
logging.debug('\nSuccessfully deleted gcloud computing instance %s of '
'project %s in zone %s', instance_name, project, zone)
def parse_arguments(argv, command): # pylint: disable=redefined-outer-name
"""Parse command line arguments and return parsed flags.
Args:
argv: command line arguments
command: the subcommand requested by the user
Returns:
parsed flags
"""
# pylint: disable=redefined-outer-name
parser = argparse.ArgumentParser(
usage='cloud_manager.py {} [<args>]'.format(command),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
'--debug',
action='store_true',
help='If set, use debug level logging. Otherwise, use info level logging'
)
parser.add_argument(
'--project',
default='google.com:tensorflow-performance',
type=str,
help='Google Cloud Platform project name to use for this invocation'
)
if command in ['create', 'start', 'stop', 'delete', 'status']:
parser.add_argument(
'--username',
default=getpass.getuser(),
type=str,
help='''Username that uniquely identifies the name of the computing instance created for PerfZero.
The default value is your ldap username.
''')
parser.add_argument(
'--zone',
default='us-west1-b',
type=str,
help='Zone of the instance to create.'
)
parser.add_argument(
'--ssh-internal-ip',
action='store_true',
help='If set, use internal IP for ssh with `gcloud beta compute ssh`.'
)
parser.add_argument(
'--ssh-key-file',
default=None,
type=str,
help='The ssh key to use with `gcloud (beta) compute ssh`.'
)
if command == 'create':
parser.add_argument(
'--accelerator_count',
default=1,
type=int,
help='The number of accelerators to attach to the instance'
)
parser.add_argument(
'--accelerator_type',
default='nvidia-tesla-v100',
type=str,
help='''The specific type (e.g. nvidia-tesla-v100 for NVIDIA Tesla V100) of
accelerator to attach to the instance. Use 'gcloud compute accelerator-types list --project=${project_name}' to
learn about all available accelerator types.
''')
parser.add_argument(
'--cpu_min_platform',
default=None,
type=str,
help='''Minimum CPU platform; only needed for CPU-only instances.''')
parser.add_argument(
'--machine_type',
default=None,
type=str,
help='''The machine type used for the instance. To get a list of available machine
types, run 'gcloud compute machine-types list --project=${project_name}'
''')
parser.add_argument(
'--image',
default='tf-ubuntu-1604-20180927-410',
type=str,
help='''Specifies the name of the image that the disk will be initialized with.
A new disk will be created based on the given image. To view a list of
public images and projects, run 'gcloud compute images list --project=${project_name}'. It is best
practice to specify the image explicitly when a particular version of an image is needed.
''')
parser.add_argument(
'--nvme_count',
default=0,
type=int,
help='''Specifies the number of NVMe local SSD devices to attach to the instance.
'''
)
parser.add_argument(
'--boot_ssd_size',
default=None,
type=int,
help='''Specifies the size (GB) of the boot disk. If unset, the size defaults to
the image size. Setting this also changes the boot disk to Persistent SSD.
'''
)
flags, unparsed = parser.parse_known_args(argv) # pylint: disable=redefined-outer-name
if unparsed:
logging.error('Arguments %s are not recognized', unparsed)
sys.exit(1)
level = logging.DEBUG if flags.debug else logging.INFO
logging.basicConfig(format='%(message)s', level=level)
return flags
if __name__ == '__main__':
parser = argparse.ArgumentParser(
usage='''cloud_manager.py <command> [<args>]
The supported commands are:
create: Create a computing instance in gcloud that is unique to the specified username, which is your ldap by default
start: Start the computing instance in gcloud that is unique to the specified username, which is your ldap by default
stop: Stop the computing instance in gcloud that is unique to the specified username, which is your ldap by default
delete: Delete the computing instance in gcloud that is unique to the specified username, which is your ldap by default
status: Query the status and information of the computing instance in gcloud that is unique to the specified username, which is your ldap by default
list_all: Query the status of all computing instances that are created by this script.'''
)
parser.add_argument(
'command',
type=str
)
flags = parser.parse_args(sys.argv[1:2])
command = flags.command
if not hasattr(sys.modules[__name__], command):
print('Error: The command <{}> is not recognized\n'.format(command))
parser.print_help()
sys.exit(1)
flags = parse_arguments(sys.argv[2:], command)
if command == 'create':
create(flags.username, flags.project, flags.zone, flags.machine_type,
flags.accelerator_count, flags.accelerator_type, flags.image,
flags.nvme_count, flags.ssh_internal_ip, flags.ssh_key_file,
cpu_min_platform=flags.cpu_min_platform,
boot_ssd_size=flags.boot_ssd_size)
elif command == 'start':
start(flags.username, flags.project, flags.zone)
elif command == 'stop':
stop(flags.username, flags.project, flags.zone)
elif command == 'delete':
delete(flags.username, flags.project, flags.zone)
elif command == 'status':
status(flags.username, flags.project, flags.zone, flags.ssh_internal_ip,
flags.ssh_key_file)
elif command == 'list_all':
list_all(flags.project)
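# Illustrative invocations of this script; the project value is a placeholder
# and the remaining flags fall back to the defaults defined in
# parse_arguments():
#   python cloud_manager.py create --project=my-gcp-project --zone=us-west1-b \
#       --accelerator_count=1 --accelerator_type=nvidia-tesla-v100
#   python cloud_manager.py status --project=my-gcp-project
#   python cloud_manager.py stop --project=my-gcp-project
#   python cloud_manager.py delete --project=my-gcp-project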
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute a single benchmark method."""
from __future__ import print_function
import datetime
import json
import logging
import os
import time
import traceback
from perfzero.process_info_tracker import ProcessInfoTracker
import perfzero.report_utils as report_utils
from perfzero.tensorflow_profiler import TensorFlowProfiler
import perfzero.utils as utils
def run(benchmark_method, harness_info, site_package_info,
root_output_dir, config, queue, trial_id):
"""Run the benchmark method and put the result, or the failure, on the queue."""
try:
_run_internal(benchmark_method, harness_info, site_package_info,
root_output_dir, config, queue, trial_id)
except Exception: # pylint: disable=broad-except
logging.error('Benchmark execution for %s failed due to error:\n %s',
benchmark_method, traceback.format_exc())
queue.put((True, None, False, None))
def _set_file_contents(content_str, output_filename):
with open(output_filename, 'w') as f:
f.write(content_str)
logging.info('Wrote summary to file %s', output_filename)
def _run_internal(benchmark_method, harness_info, site_package_info,
root_output_dir, config, queue, trial_id):
"""Run benchmark method and put result to the queue.
Args:
benchmark_method: Canonical path to the benchmark method
harness_info: Description of the benchmark harness used in the benchmark
site_package_info: Description of the site-package used in the benchmark
root_output_dir: Directory under which to put the benchmark output
config: An instance of perfzero_config
queue: An interprocess queue to transfer benchmark result to the caller.
trial_id: An integer trial id to annotate in the benchmark result.
"""
start_timestamp = time.time()
execution_timestamp = start_timestamp
method_has_exception = False
execution_id = (config.execution_id if config.execution_id else
datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
output_dir = os.path.join(root_output_dir, execution_id)
if config.scratch_gcs_url:
model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
else:
model_output_dir = output_dir
utils.make_dir_if_not_exist(output_dir)
benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
benchmark_class_name = benchmark_class.rsplit('.', 1)[1]
tensorflow_profiler = TensorFlowProfiler(
config.profiler_enabled_time_str, output_dir)
process_info_tracker = ProcessInfoTracker(output_dir)
process_info = None
# Setup per-method file logger
filehandler = logging.FileHandler(
filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
filehandler.setFormatter(
logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
logging.getLogger().addHandler(filehandler)
try:
if config.tpu_parameters:
tpu = config.tpu_parameters.get('name')
else:
tpu = None
if config.perfzero_constructor_args:
constructor_args = json.loads(config.perfzero_constructor_args)
else:
constructor_args = {}
class_instance = utils.instantiate_benchmark_class(
benchmark_class=benchmark_class,
output_dir=model_output_dir,
root_data_dir=config.root_data_dir,
tpu=tpu,
constructor_args=constructor_args,
benchmark_class_type=config.benchmark_class_type)
# tf.test.Benchmark.report_benchmark() writes results to a file with
# path benchmark_result_file_path_prefix + benchmark_method
benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
benchmark_result_file_path = '{}{}.{}'.format(
benchmark_result_file_path_prefix,
benchmark_class_name,
benchmark_method_name)
# Start background threads for profiler and system info tracker
tensorflow_profiler.start()
process_info_tracker.start()
# Run benchmark method
execution_timestamp = time.time()
logging.info('Starting benchmark execution: %s', benchmark_method)
getattr(class_instance, benchmark_method_name)()
logging.info('Stopped benchmark: %s', benchmark_method)
# Read and build benchmark results
raw_benchmark_result = utils.read_benchmark_result(
benchmark_result_file_path)
# Explicitly overwrite the name to be the full path to benchmark method
raw_benchmark_result['name'] = benchmark_method
except Exception: # pylint: disable=broad-except
logging.error('Benchmark execution for %s failed due to error:\n %s',
benchmark_method, traceback.format_exc())
method_has_exception = True
raw_benchmark_result = {}
raw_benchmark_result['name'] = benchmark_method
raw_benchmark_result['wall_time'] = -1
raw_benchmark_result['extras'] = {}
finally:
# Stop background threads for profiler and system info tracker
process_info = process_info_tracker.stop()
tensorflow_profiler.stop()
upload_timestamp = time.time()
benchmark_result = report_utils.build_benchmark_result(
raw_benchmark_result, method_has_exception, trial_id)
execution_summary = report_utils.build_execution_summary(
execution_timestamp,
execution_id,
config.ml_framework_build_label,
config.execution_label,
config.platform_name,
config.system_name,
config.output_gcs_url,
benchmark_result,
config.get_env_vars(),
config.get_flags(),
harness_info,
site_package_info,
process_info,
method_has_exception,
is_tpu_benchmark=(config.tpu_parameters is not None))
report_utils.upload_execution_summary(
config.bigquery_project_name,
config.bigquery_dataset_table_name,
execution_summary)
report_utils.execute_methods(
config.result_upload_methods,
execution_summary)
logging.info('Benchmark execution for %s completed with summary:\n %s',
benchmark_method, json.dumps(execution_summary, indent=2))
_set_file_contents(json.dumps(execution_summary, indent=2),
os.path.join(output_dir, 'perfzero_summary.json'))
utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
logging.getLogger().removeHandler(filehandler)
method_execution_time = {
'class_initialization': execution_timestamp - start_timestamp,
'method_execution': upload_timestamp - execution_timestamp,
'log_upload': time.time() - upload_timestamp
}
if config.profiler_enabled_time_str:
relative_output_dir = output_dir[output_dir.find('benchmark'):]
print('\nExecute the command below to start tensorboard server using '
'the collected profiler data:\ntensorboard --logdir={}\n\n'
'Open localhost:6006 in your browser to access the TensorBoard '
'GUI. Use ssh with port forwarding if tensorboard is running on '
'a remote machine.\n'.format(relative_output_dir))
queue.put((method_has_exception, method_execution_time,
benchmark_result['succeeded'], output_dir))
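# A minimal sketch (not part of PerfZero itself) of how a caller can consume
# the (method_has_exception, method_execution_time, succeeded, output_dir)
# tuple that run() puts on the queue; the multiprocessing wiring below is an
# assumption about the caller, not code from this module:
#
#   import multiprocessing
#   queue = multiprocessing.Queue()
#   worker = multiprocessing.Process(
#       target=run,
#       args=(benchmark_method, harness_info, site_package_info,
#             root_output_dir, config, queue, trial_id))
#   worker.start()
#   has_exception, execution_time, succeeded, output_dir = queue.get()
#   worker.join()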
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Setup the data drive with raid, RAM, or mount network drives."""
from __future__ import print_function
import logging
import perfzero.utils as utils
def create_drive_from_devices(data_dir, gce_nvme_raid):
"""Creates a drive at data directory."""
if not gce_nvme_raid:
return
devices = _get_nvme_devices()
cmd = 'mountpoint -q {}'.format(data_dir)
retcode, _ = utils.run_command(cmd)
if retcode:
if len(devices) > 1:
_create_drive_raid(data_dir, devices)
else:
_create_single_drive(data_dir, devices[0])
def _get_nvme_devices():
"""Returns list paths to nvme devices."""
devices = []
cmd = 'lsblk'
retcode, log = utils.run_command(cmd)
if retcode:
raise Exception('"{}" failed with code:{} and log:\n{}'.format(
cmd, retcode, log))
lines = log.splitlines()
if lines:
for line in lines:
if line.startswith('nvme'):
parts = line.split()
devices.append('/dev/' + parts[0].strip())
return devices
def _create_single_drive(data_dir, device):
"""Creates a data drive out of a single device."""
cmds = []
cmds.append('mkfs.ext4 -F {}'.format(device))
cmds.append('mkdir -p {}'.format(data_dir))
cmds.append('mount {} {}'.format(device, data_dir))
cmds.append('chmod a+w {}'.format(data_dir))
utils.run_commands(cmds)
logging.info('Created and mounted device %s at %s', device, data_dir)
def _create_drive_raid(data_dir, devices):
"""Creates a raid zero array of nvme drives."""
cmds = []
# Pipe 'yes' into mdadm because GCE NVMe drives are sometimes in an odd
# state and report that they already belong to another RAID array (possibly
# because the Kokoro images were left dirty), and mdadm does not have a -y
# option to auto-confirm.
cmds.append('yes | mdadm --create /dev/md0 --level=0 '
'--raid-devices={} {}'.format(
len(devices), ' '.join(devices)))
cmds.append('mkfs.ext4 -F /dev/md0')
cmds.append('mkdir -p {}'.format(data_dir))
cmds.append('mount /dev/md0 {}'.format(data_dir))
cmds.append('chmod a+w {}'.format(data_dir))
utils.run_commands(cmds)
logging.info('Created and mounted RAID array at %s', data_dir)
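# Illustrative usage of this module; the data directory is a placeholder, any
# non-empty gce_nvme_raid value enables the logic, and the underlying
# mkfs/mount/mdadm commands require root privileges:
#
#   import perfzero.device_utils as device_utils
#   device_utils.create_drive_from_devices('/data', gce_nvme_raid='raid0')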
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero configs provided by user."""
from __future__ import print_function
import json
import logging
import os
def add_setup_parser_arguments(parser):
"""Add arguments to the parser used by the setup.py."""
parser.add_argument(
'--dockerfile_path',
default='docker/Dockerfile_ubuntu_1804_tf_v1',
type=str,
help='''Build the docker image using the docker file located at ${pwd}/${dockerfile_path} if
it exists, where ${pwd} is the user's current working directory. Otherwise, build
the docker image using the docker file located at path_to_perfzero/${dockerfile_path}.
''')
parser.add_argument(
'--workspace',
default='workspace',
type=str,
help='''The gcloud key file will be downloaded under directory path_to_perfzero/${workspace}
''')
parser.add_argument(
'--gcloud_key_file_url',
default='',
type=str,
help='''DEPRECATED: Use --gcloud_key_file_url of benchmark.py instead.
The gcloud key file url. When specified, it will be downloaded to the
directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
''')
parser.add_argument(
'--root_data_dir',
default='/data',
type=str,
help='The directory which should contain the dataset required by the benchmark method.'
)
parser.add_argument(
'--gce_nvme_raid',
default=None,
type=str,
help='If set to a non-empty string, create a RAID 0 array from the local NVMe devices and mount it at the directory specified by the flag --root_data_dir'
)
parser.add_argument(
'--tensorflow_pip_spec',
default=None,
type=str,
help='''The tensorflow pip package specification. The format can be either ${package_name}, or ${package_name}==${package_version}.
Example values include tf-nightly-gpu, and tensorflow==1.12.0. If it is specified, the corresponding tensorflow pip package/version
will be installed. Otherwise, the default tensorflow pip package specified in the docker file will be installed.
''')
parser.add_argument(
'--extra_pip_specs',
default='',
type=str,
help='''Additional specifications to pass to `pip install`. (e.g. pinning certain dependencies)
Specifications should be semicolon separated: e.g. `numpy==1.16.4;scipy==1.3.1`
''')
parser.add_argument(
'--docker_tag',
default='perfzero/tensorflow',
type=str,
help='The docker tag to use if building a docker image.'
)
parser.add_argument(
'--site_package_downloads',
default='',
type=str,
help='''Comma separated list of dirs on the external VM to copy to the docker\'s site-packages dir.
Format: <absolute-path>/src/dir:new_base_dir_name,<absolute-path>/src/dir2:new_name,...
This will copy <absolute-path>/src/dir to <site-packages>/new_base_dir_name.
'''
)
parser.add_argument(
'--extra_docker_build_args',
nargs='*',
default='',
type=str,
help='''Additional build-args to pass to `docker build`.
Example: --extra_docker_build_args arg0 arg1=value1 "arg2=value with space" arg3=300.
Each string will be passed directly as a build-arg to docker, so the above example will be passed as follows:
--build-arg arg0 --build-arg arg1=value1 --build-arg "arg2=value with space" --build-arg arg3=300
'''
)
def add_benchmark_parser_arguments(parser):
"""Add arguments to the parser used by the benchmark.py."""
parser.add_argument(
'--use_cached_site_packages',
action='store_true',
help='If set, skip git pull for dependent git repositories if they already exist in path_to_perfzero/${workspace}/site-packages'
)
parser.add_argument(
'--gcs_downloads',
default=None,
type=str,
help='This flag is deprecated. Use the flag --data_downloads instead')
parser.add_argument(
'--git_repos',
default=None,
type=str,
help='''A string representing git repositories to checkout. The format is url_1;branch_1;hash_1,url_2;branch_2;hash_2,...
Git repositories will be checked-out under directory path_to_perfzero/${workspace}/site-packages,
where ${workspace} either defaults to 'workspace', or takes the value of the flag --workspace.
branch and hash can be skipped if the user wants to use the head of the master branch,
which shortens the format to url_1,url_2,...
''')
parser.add_argument(
'--benchmark_num_trials',
default=1,
type=int,
help='''Configures number of times to run each benchmark method
after setup completion.''')
parser.add_argument(
'--benchmark_methods',
action='append',
default=[],
type=str,
help='''This string specifies the benchmark method to be executed. The flag can be specified multiple times, in which case
the union of methods matched by these flags will be executed. The format can be module_path.class_name.method_name, in which
case the corresponding method is executed. The format can also be module_path.class_name.filter:regex_pattern, in which case all methods
of the given class whose method name matches the given regular expression are executed.
''')
parser.add_argument(
'--ml_framework_build_label',
default=None,
type=str,
help='A string that identifies the machine learning framework build, e.g. nightly-gpu-build'
)
parser.add_argument(
'--execution_label',
default=None,
type=str,
help='A string that identifies the benchmark execution type, e.g. test, prod'
)
parser.add_argument(
'--platform_name',
default=None,
type=str,
help='A string that identifies the computing platform, e.g. gcp, aws'
)
parser.add_argument(
'--system_name',
default=None,
type=str,
help='A string that identifies the hardware system, e.g. n1-standard-64-8xV100'
)
parser.add_argument(
'--output_gcs_url',
default=None,
type=str,
help='''If specified, log files generated by the benchmark execution will be uploaded to output_gcs_url/${execution_id},
where ${execution_id} is a string generated by PerfZero which uniquely identifies the execution of one benchmark method
''')
parser.add_argument(
'--scratch_gcs_url',
default=None,
type=str,
help='''If specified, intermediate files like model outputs will be stored in scratch_gcs_url/${execution_id}, where
${execution_id} is a string that is generated by PerfZero which uniquely identifies the execution of one benchmark method.
If not specified, intermediate files will be stored in a local folder on the host.
''')
parser.add_argument(
'--bigquery_project_name',
default=None,
type=str,
help='''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
The value of each field can in turn be a json-formatted string. See README.md for example output.
''')
parser.add_argument(
'--bigquery_dataset_table_name',
default=None,
type=str,
help='''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
The value of each field can in turn be a json-formatted string. See README.md for example output.
''')
parser.add_argument(
'--python_path',
default=None,
type=str,
help='''A string of format path_1,path_2,... For each ${path} specified in the string,
path_to_perfzero/${workspace}/site-packages/${path} will be added to the python path so that libraries downloaded by --git_repos can
be loaded and executed.
''')
parser.add_argument(
'--workspace',
default='workspace',
type=str,
help='''The log files, gcloud key file and git repositories will be generated and downloaded under the
directory path_to_perfzero/${workspace}
''')
parser.add_argument(
'--root_data_dir',
default='/data',
type=str,
help='The directory which should contain the dataset required by the benchmark method.'
)
parser.add_argument(
'--data_downloads',
default=None,
type=str,
help='''A string of format url_1;relative_path_1,url_2;relative_path_2,...
Data will be copied from ${url} to ${root_data_dir}/${relative_path}. ${relative_path} can be skipped if it is the same as the
base name of the url, which shortens the format to url_1,url_2,... ${root_data_dir} is specified by the flag --root_data_dir.
Files will be decompressed in ${root_data_dir} if the name ends with 'gz'. Only urls prefixed with file://, gcs://, http://
or https:// are supported.
''')
parser.add_argument(
'--gcloud_key_file_url',
default='gs://tf-performance/auth_tokens/benchmark_upload_gce.json',
type=str,
help='''The gcloud key file url. When specified, it will be downloaded to the
directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
The key file will then be activated and used as gcloud authentication credential.
''')
parser.add_argument(
'--debug',
action='store_true',
help='If set, use debug level logging. Otherwise, use info level logging'
)
parser.add_argument(
'--profiler_enabled_time',
default=None,
type=str,
help='''A string of format begin_time_1:end_time_1,begin_time_2:end_time_2,.... PerfZero will start to collect profiler
data ${begin_time} sec after benchmark method execution starts. The data collection continues for ${end_time - begin_time}
sec or until the benchmark method execution finishes, whichever occurs first. If ${end_time} is not explicitly
specified, it is assumed to be MAX_LONG.
''')
parser.add_argument(
'--execution_id',
default=None,
type=str,
help='A string that uniquely identifies the benchmark execution.')
parser.add_argument(
'--result_upload_methods',
default=None,
type=str,
help='A comma separated list of class.method values to upload results.')
parser.add_argument(
'--tpu_parameters',
default=None,
type=str,
help='''A json dictionary of cloud tpu parameters. The format must look like the following:
{"name": "my-tpu-name", project": "my-gcp-project-id", "zone": "europe-west4-a", "size": "v3-8", "version": "nightly-2.x"}
It can have an optional key value pair "version_id" -> "nightly version" to change the tpu version id.
Example "version_id": "2.4.0-dev20200728".
''')
parser.add_argument(
'--perfzero_constructor_args',
nargs='*',
default='',
type=str,
help='''A json dictionary of additional args to pass to the perfzero
constructor.'''
)
parser.add_argument(
'--benchmark_class_type',
default=None,
type=str,
help='''The benchmark class type. If not set, perfzero_benchmark is assumed. Set to "tf_benchmark"
for tf.test.Benchmark benchmarks.''')
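# An illustrative benchmark.py invocation using the flags defined above; the
# repository URL, python path and benchmark method names are placeholders:
#
#   python benchmark.py \
#       --git_repos="https://github.com/tensorflow/models.git;master" \
#       --python_path=models \
#       --benchmark_methods=official.staging.SomeBenchmark.benchmark_method \
#       --benchmark_methods=official.staging.SomeBenchmark.filter:benchmark_.* \
#       --profiler_enabled_time=10:60 \
#       --benchmark_num_trials=3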
class PerfZeroConfig(object):
"""Creates and contains config for PerfZero."""
def __init__(self, mode, flags=None):
self.mode = mode
self.flags = flags
if mode == 'flags':
self.gcs_downloads_str = flags.gcs_downloads
self.data_downloads_str = flags.data_downloads
self.git_repos_str = flags.git_repos
self.benchmark_method_patterns = []
for value in flags.benchmark_methods:
self.benchmark_method_patterns.extend(value.split(','))
self.ml_framework_build_label = flags.ml_framework_build_label
self.execution_label = flags.execution_label
self.platform_name = flags.platform_name
self.system_name = flags.system_name
self.output_gcs_url = flags.output_gcs_url
self.scratch_gcs_url = flags.scratch_gcs_url
self.bigquery_project_name = flags.bigquery_project_name
self.bigquery_dataset_table_name = flags.bigquery_dataset_table_name
self.python_path_str = flags.python_path
self.workspace = flags.workspace
self.benchmark_class_type = flags.benchmark_class_type
self.use_cached_site_packages = flags.use_cached_site_packages
self.root_data_dir = flags.root_data_dir
self.gcloud_key_file_url = flags.gcloud_key_file_url
self.profiler_enabled_time_str = flags.profiler_enabled_time
self.execution_id = flags.execution_id
self.result_upload_methods = flags.result_upload_methods
self.perfzero_constructor_args = flags.perfzero_constructor_args
self.benchmark_num_trials = flags.benchmark_num_trials
if flags.tpu_parameters:
self.tpu_parameters = json.loads(flags.tpu_parameters)
else:
self.tpu_parameters = None
if not flags.benchmark_methods:
logging.warning('No benchmark method is specified by '
'--benchmark_methods')
if flags.bigquery_project_name and not flags.bigquery_dataset_table_name:
raise ValueError('--bigquery_project_name is specified but '
'--bigquery_dataset_table_name is not')
if not flags.bigquery_project_name and flags.bigquery_dataset_table_name:
raise ValueError('--bigquery_dataset_table_name is specified but '
'--bigquery_project_name is not')
def get_env_vars(self):
env_vars = {}
for key in os.environ.keys():
if key.startswith('PERFZERO_'):
env_vars[key] = os.environ[key]
return env_vars
def get_flags(self):
not_none_flags = {}
for key in vars(self.flags):
value = getattr(self.flags, key)
if value is not None:
not_none_flags[key] = value
return not_none_flags
def get_git_repos(self, site_packages_dir):
"""Parse git repository string."""
git_repos = []
if not self.git_repos_str:
return git_repos
for repo_entry in self.git_repos_str.split(','):
parts = repo_entry.split(';')
git_repo = {}
git_repo['url'] = parts[0]
# Assume the git url has format */{dir_name}.git
git_repo['dir_name'] = parts[0].rsplit('/', 1)[-1].rsplit('.', 1)[0]
git_repo['local_path'] = os.path.join(site_packages_dir,
git_repo['dir_name'])
if len(parts) >= 2:
git_repo['branch'] = parts[1]
if len(parts) >= 3:
git_repo['git_hash'] = parts[2]
git_repos.append(git_repo)
return git_repos
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for perfzero_config.py."""
from __future__ import print_function
import os
import unittest
import perfzero.perfzero_config as perfzero_config
class TestPerfZeroConfig(unittest.TestCase):
def test_get_git_repos(self):
config = perfzero_config.PerfZeroConfig(mode='mock')
config.git_repos_str = 'https://github.com/tensorflow/benchmarks.git;branch_1;hash_1,https://github.com/tensorflow/models.git;branch_2'
git_repos = config.get_git_repos('/site_package_dir')
git_repo_1 = {}
git_repo_1['url'] = 'https://github.com/tensorflow/benchmarks.git'
git_repo_1['dir_name'] = 'benchmarks'
git_repo_1['local_path'] = '/site_package_dir/benchmarks'
git_repo_1['branch'] = 'branch_1'
git_repo_1['git_hash'] = 'hash_1'
git_repo_2 = {}
git_repo_2['url'] = 'https://github.com/tensorflow/models.git'
git_repo_2['dir_name'] = 'models'
git_repo_2['local_path'] = '/site_package_dir/models'
git_repo_2['branch'] = 'branch_2'
self.assertEqual(2, len(git_repos))
self.assertEqual(git_repo_1, git_repos[0])
self.assertEqual(git_repo_2, git_repos[1])
def test_get_env_vars(self):
config = perfzero_config.PerfZeroConfig(mode='mock')
self.assertEqual({}, config.get_env_vars())
os.environ['PERFZERO_VAR1'] = 'value1'
self.assertEqual({'PERFZERO_VAR1': 'value1'}, config.get_env_vars())
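# The tests above can be run with the standard library test runner, e.g.:
#
#   python -m unittest perfzero.perfzero_config_test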
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keep track of process information such as maximum memory usage with a separate thread."""
from __future__ import absolute_import
import json
import logging
import os
import sched
import threading
import time
import traceback
import psutil
class ProcessInfoTracker(object):
"""Keep track of process information such as maximum memory usage with separate thread."""
def __init__(self, output_dir):
self.process_info_log = open(os.path.join(output_dir, 'process_info.log'),
'w')
self.scheduler = sched.scheduler(time.time, time.sleep)
self.process_info = {}
self.process_info['max_rss'] = 0
self.process_info['max_vms'] = 0
self.process_info['max_cpu_percent'] = 0
self.exit_event = threading.Event()
self.last_exception = None
self.start_time = None
def start(self):
self.start_time = time.time()
# 4th positional arg added to support Python2 for the short-term.
self.scheduler.enter(1, 1, self._update_process_info, ()) # pylint: disable=no-value-for-parameter
threading.Thread(target=self.scheduler.run).start()
logging.info('Started process information tracker.')
def stop(self):
self.exit_event.set()
self.process_info_log.flush()
logging.info('Stopped process information tracker.')
if self.last_exception is not None:
raise self.last_exception # pylint: disable=raising-bad-type
return dict(self.process_info)
def _update_process_info(self):
"""Read and update process info using background thread every 1 second."""
try:
p = psutil.Process(os.getpid())
memory_info = p.memory_info()
# This is a blocking call which takes 0.1 second. This affects the
# interval at which the metrics are reported.
cpu_percent = p.cpu_percent(interval=0.1)
self.process_info['max_rss'] = max(self.process_info['max_rss'],
memory_info.rss)
self.process_info['max_vms'] = max(self.process_info['max_vms'],
memory_info.vms)
self.process_info['max_cpu_percent'] = max(
self.process_info['max_cpu_percent'], cpu_percent)
entry = {}
entry['time'] = time.time() - self.start_time
entry['rss'] = memory_info.rss
entry['vms'] = memory_info.vms
entry['cpu_percent'] = cpu_percent
self.process_info_log.write(json.dumps(entry) + '\n')
if not self.exit_event.is_set():
# Schedule the next event to be run after 1 second
# 4th positional arg added to support Python2 for the short-term.
self.scheduler.enter(1, 1, self._update_process_info, ()) # pylint: disable=no-value-for-parameter
except Exception as e: # pylint: disable=broad-except
logging.error('Process info tracker failed due to error:\n %s',
traceback.format_exc())
self.last_exception = e
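# Illustrative usage of the tracker; the output directory is a placeholder:
#
#   tracker = ProcessInfoTracker(output_dir='/tmp/perfzero_output')
#   tracker.start()
#   ...  # run the workload to be measured
#   info = tracker.stop()
#   # info now holds {'max_rss': ..., 'max_vms': ..., 'max_cpu_percent': ...}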