dcuai / dlexamples · Commits

Commit ee3997b3, authored Apr 15, 2022 by qianyj
new tf branch for dtk21.10.1
parent 2795dc1f

Changes: 383 files in total. Showing 20 changed files with 2941 additions and 0 deletions (+2941, -0).
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/resnet50_synth.sh  +82  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/run_single_benchmark.sh  +74  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/__init__.py  +0  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark.py  +193  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark_test.py  +57  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/cloud_manager.py  +431  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/__init__.py  +0  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/benchmark_method_runner.py  +187  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/device_utils.py  +86  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config.py  +367  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config_test.py  +54  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/process_info_tracker.py  +93  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/report_utils.py  +237  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tensorflow_profiler.py  +128  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_no_processes.txt  +40  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_processes.txt  +43  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/nvme_device_log.txt  +15  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tpu_runtime_utils.py  +89  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils.py  +546  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils_test.py  +219  -0
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/resnet50_synth.sh (new file, 0 → 100644)
#!/bin/bash
set -e
set -x

# To run this script from a GCP VM / host connected to GPUs:
# git clone https://github.com/tensorflow/benchmarks.git
# cd benchmarks
# bash perfzero/dockertest/resnet50_synth.sh
# Output log files/results will be stored at perfzero/workspace/output/
# Modify INPUT_PARAMS variables below to tweak the tf whl under test / benchmark methods / dataset paths.
# You can comment out "build_docker" call at the end, if the docker's already built.

## INPUT PARAMS: start
# Acceptable formats for TF_PIP_SPEC
#   pypi nightlies: tf-nightly-gpu==2.6.0.dev20210521
#   gcs path to whls: gs://some-path-to-tf.whl
#   Local path to whl: file://some-local-path-to-tf.whl
TF_PIP_SPEC="tf-nightly-gpu==2.6.0.dev20210624"
# Path to GCS or local files containing the input datasets (if they need to be fetched into the docker).
DATA_DOWNLOADS=""
# Comma separated list of strings.
BENCHMARK_METHODS="official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.benchmark_1_gpu_fp16"
# If either the tf_pip_spec or data downloads reference private GCP, then we
# need to set GCLOUD_KEY_FILE_URL to a credentials file.
GCLOUD_KEY_FILE_URL=""
# Commit id under repository tensorflow/models, branch='benchmark' which has the benchmarks.
MODELS_GIT_HASH="169e4051aef247c27a95748a8015b2f35f509e1a"
## INPUT PARAMS: end

build_docker() {
  echo "building docker"
  sudo python3 perfzero/lib/setup.py \
    --dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_cuda_11 \
    --tensorflow_pip_spec="${TF_PIP_SPEC}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}" \
    --extra_docker_build_args=
  sudo docker images
}

run_benchmark() {
  echo "running benchmark"
  benchmark_tag=$1
  env_var=$2
  sudo nvidia-docker run \
    -v ${PWD}:/workspace \
    -v /data:/data \
    -e PERFZERO_EXECUTION_MODE=test \
    -e TF_ENABLE_LEGACY_FILESYSTEM=1 \
    -e ${env_var} \
    perfzero/tensorflow python3 \
    /workspace/perfzero/lib/benchmark.py \
    --root_data_dir=/data \
    --bigquery_dataset_table_name="" \
    --benchmark_class_type= \
    --ml_framework_build_label=v2-nightly-gpu-${benchmark_tag} \
    --execution_label=test-benchmark \
    --platform_name=kokoro-gcp \
    --system_name=n1-standard-8-1xV100 \
    --output_gcs_url="" \
    --benchmark_num_trials=1 \
    --scratch_gcs_url= \
    --bigquery_project_name="" \
    --git_repos="https://github.com/tensorflow/models.git;benchmark;${MODELS_GIT_HASH}" \
    --data_downloads="${DATA_DOWNLOADS}" \
    --python_path=models \
    --benchmark_methods="${BENCHMARK_METHODS}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}"
}

build_docker
run_benchmark "control" "TF_CUDNN_USE_FRONTEND=false"
run_benchmark "experiment" "TF_CUDNN_USE_FRONTEND=true"
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/dockertest/run_single_benchmark.sh (new file, 0 → 100644)
#!/bin/bash
set -e
set -x

# To run this script from a GCP VM / host connected to GPUs:
# git clone https://github.com/tensorflow/benchmarks.git
# cd benchmarks
# bash perfzero/dockertest/run_single_benchmark.sh
# Output log files/results will be stored at perfzero/workspace/output/
# Modify INPUT_PARAMS variables below to tweak the tf whl under test / benchmark methods / dataset paths.
# You can comment out "build_docker" call at the end, if the docker's already built.

## INPUT PARAMS: start
# Acceptable formats for TF_PIP_SPEC
#   pypi nightlies: tf-nightly-gpu==2.6.0.dev20210521
#   gcs path to whls: gs://some-path-to-tf.whl
#   Local path to whl: file://some-local-path-to-tf.whl
TF_PIP_SPEC="tf-nightly-gpu==2.6.0.dev20210521"
# Path to GCS or local files containing the input datasets (if they need to be fetched into the docker).
DATA_DOWNLOADS=""
# Comma separated list of strings.
BENCHMARK_METHODS="official.benchmark.keras_imagenet_benchmark.Resnet50KerasBenchmarkSynth.benchmark_xla_1_gpu_fp16"
# If either the tf_pip_spec or data downloads reference private GCP, then we
# need to set GCLOUD_KEY_FILE_URL to a credentials file.
GCLOUD_KEY_FILE_URL=""
## INPUT PARAMS: end

build_docker() {
  echo "building docker"
  sudo python3 perfzero/lib/setup.py \
    --dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_cuda_11 \
    --tensorflow_pip_spec="${TF_PIP_SPEC}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}" \
    --extra_docker_build_args=
  sudo docker images
}

run_benchmark() {
  echo "running benchmark"
  sudo nvidia-docker run \
    -v ${PWD}:/workspace \
    -v /data:/data \
    -e PERFZERO_EXECUTION_MODE=test \
    -e TF_ENABLE_LEGACY_FILESYSTEM=1 \
    perfzero/tensorflow python3 \
    /workspace/perfzero/lib/benchmark.py \
    --root_data_dir=/data \
    --bigquery_dataset_table_name="" \
    --benchmark_class_type= \
    --ml_framework_build_label=v2-nightly-gpu \
    --execution_label=test-benchmark \
    --platform_name=kokoro-gcp \
    --system_name=n1-standard-8-1xV100 \
    --output_gcs_url="" \
    --benchmark_num_trials=1 \
    --scratch_gcs_url= \
    --bigquery_project_name="" \
    --git_repos='https://github.com/tensorflow/models.git;benchmark;f7938e6ad46fecfa1112eda579eb046eb3f3bf96' \
    --data_downloads="${DATA_DOWNLOADS}" \
    --python_path=models \
    --benchmark_methods="${BENCHMARK_METHODS}" \
    --gcloud_key_file_url="${GCLOUD_KEY_FILE_URL}"
}

build_docker
run_benchmark
TensorFlow2x/ComputeVision/Classification/tf_cnn_benchmarks/models/__init__.py → TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/__init__.py (file moved)
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark.py (new file, 0 → 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute benchmark."""
from __future__ import print_function

import argparse
import json
import logging
import multiprocessing
import os
import re
import sys
import time

import perfzero.benchmark_method_runner as benchmark_method_runner
import perfzero.perfzero_config as perfzero_config
import perfzero.tpu_runtime_utils as tpu_runtime_utils
import perfzero.utils as utils


class BenchmarkRunner(object):
  """Execute benchmark and report results."""

  def __init__(self, config):
    self.config = config
    self.project_dir = os.path.abspath(
        os.path.dirname(os.path.dirname(__file__)))
    self.workspace_dir = os.path.join(self.project_dir, config.workspace)
    self.site_packages_dir = os.path.join(self.workspace_dir, 'site-packages')
    self.root_output_dir = os.path.join(self.workspace_dir, 'output')
    self.benchmark_execution_time = {}

  def _setup(self):
    """Download data and checkout git repository."""
    # Activate gcloud service
    start_time = time.time()
    utils.setup_python_path(self.site_packages_dir,
                            self.config.python_path_str)
    utils.active_gcloud_service(self.config.gcloud_key_file_url,
                                self.workspace_dir)
    utils.make_dir_if_not_exist(self.root_output_dir)
    self.benchmark_execution_time['activate_gcloud_service'] = (
        time.time() - start_time)

    # Download data
    start_time = time.time()
    utils.download_data(utils.parse_data_downloads_str(
        self.config.root_data_dir, self.config.gcs_downloads_str))
    utils.download_data(utils.parse_data_downloads_str(
        self.config.root_data_dir, self.config.data_downloads_str))
    self.benchmark_execution_time['download_data'] = time.time() - start_time

    # Checkout git repositories
    start_time = time.time()
    site_package_info = utils.checkout_git_repos(
        self.config.get_git_repos(self.site_packages_dir),
        self.config.use_cached_site_packages)
    self.benchmark_execution_time['checkout_repository'] = (
        time.time() - start_time)

    # Start cloud TPU.
    if self.config.tpu_parameters is not None:
      start_time = time.time()
      utils.setup_tpu(self.config.tpu_parameters)
      tpu_info = tpu_runtime_utils.configure_tpu(self.config.tpu_parameters)
      site_package_info['tpu_version'] = tpu_info
      self.benchmark_execution_time['start_tpu'] = time.time() - start_time

    self.stream_handler = logging.StreamHandler(sys.stdout)
    self.stream_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(self.stream_handler)

    return site_package_info

  def _get_benchmark_methods(self):
    """Returns list of benchmark methods to execute."""
    filter_prefix = 'filter:'
    benchmark_methods = []
    for benchmark_method_pattern in self.config.benchmark_method_patterns:
      if filter_prefix not in benchmark_method_pattern:
        benchmark_methods.append(benchmark_method_pattern)
      else:
        index = benchmark_method_pattern.find(filter_prefix)
        benchmark_class = benchmark_method_pattern[:index - 1]
        pattern = benchmark_method_pattern[index + len(filter_prefix):]
        class_instance = utils.instantiate_benchmark_class(
            benchmark_class, '/dev/null', '', None, {},
            benchmark_class_type=self.config.benchmark_class_type)
        for benchmark_method_name in dir(class_instance):
          if re.match(pattern, benchmark_method_name):
            benchmark_methods.append(benchmark_class + '.' +
                                     benchmark_method_name)
    logging.info('The following benchmark methods will be executed: %s',
                 benchmark_methods)
    return benchmark_methods

  def _run_benchmarks_trial(self, harness_info, site_package_info,
                            benchmark_methods, trial_id):
    """Runs a single trial of all benchmark methods."""
    # Run the benchmark method in a separate process so that its memory usage
    # will not affect the execution of other benchmark methods.
    # This is a workaround before we fix all memory leak issues in TensorFlow.
    has_exception = False
    benchmark_success_results = {}
    benchmark_output_dirs = {}
    benchmark_execution_time = {}
    for benchmark_method in benchmark_methods:
      queue = multiprocessing.Queue()
      process = multiprocessing.Process(
          target=benchmark_method_runner.run,
          args=(benchmark_method, harness_info, site_package_info,
                self.root_output_dir, self.config, queue, trial_id))
      process.start()
      process.join()
      method_has_exception, method_execution_time, succeeded, output_dir = queue.get()  # pylint: disable=line-too-long
      has_exception |= method_has_exception
      benchmark_execution_time[benchmark_method] = method_execution_time
      benchmark_success_results[benchmark_method] = succeeded
      benchmark_output_dirs[benchmark_method] = output_dir
    return (has_exception, benchmark_success_results, benchmark_output_dirs,
            benchmark_execution_time)

  def run_benchmark(self):
    """Run benchmark."""
    harness_info = utils.get_git_repo_info(self.project_dir)
    has_exception = False
    benchmark_success_results = {}
    benchmark_output_dirs = {}
    num_trials = self.config.benchmark_num_trials

    try:
      site_package_info = self._setup()
      benchmark_methods = self._get_benchmark_methods()
      print('Setup complete. Running {} trials'.format(num_trials))
      for trial_id in range(1, num_trials + 1):
        print('Running trial {} / {}'.format(trial_id, num_trials))
        (trial_has_exception, trial_success_results, trial_output_dirs,
         trial_execution_time) = self._run_benchmarks_trial(
             harness_info, site_package_info, benchmark_methods, trial_id)
        trial_key = 'trial_{}'.format(trial_id)
        has_exception |= trial_has_exception
        self.benchmark_execution_time[trial_key] = trial_execution_time
        benchmark_success_results[trial_key] = trial_success_results
        benchmark_output_dirs[trial_key] = trial_output_dirs
    finally:
      if self.config.tpu_parameters is not None:
        has_exception |= utils.cleanup_tpu(self.config.tpu_parameters)
      print('Benchmark execution time in seconds by operation:\n{}'.format(
          json.dumps(self.benchmark_execution_time, indent=2)))
      print('Benchmark success results:\n{}'.format(
          json.dumps(benchmark_success_results, indent=2)))
      print('Benchmark local output directories:\n{}'.format(
          json.dumps(benchmark_output_dirs, indent=2)))
      if has_exception:
        sys.exit(1)


if __name__ == '__main__':
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  perfzero_config.add_benchmark_parser_arguments(parser)
  FLAGS, unparsed = parser.parse_known_args()

  level = logging.DEBUG if FLAGS.debug else logging.INFO
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                      level=level)

  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  config_ = perfzero_config.PerfZeroConfig(mode='flags', flags=FLAGS)
  benchmark_runner = BenchmarkRunner(config_)
  benchmark_runner.run_benchmark()
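A note on the structure above: _run_benchmarks_trial launches each benchmark method in its own process so that, as the in-code comment says, memory leaks in one method cannot affect the next, and the child reports back through a multiprocessing.Queue as a (has_exception, execution_time, succeeded, output_dir) tuple. A minimal, self-contained sketch of that pattern; the worker here is a stand-in for benchmark_method_runner.run, not the real implementation:

import multiprocessing

def fake_benchmark_worker(benchmark_method, queue):
    # Stand-in for perfzero.benchmark_method_runner.run(); the real worker
    # reports (has_exception, execution_time, succeeded, output_dir).
    try:
        execution_time = {'method_execution': 1.23}  # illustrative value
        queue.put((False, execution_time, True, '/tmp/perfzero-output'))
    except Exception:
        queue.put((True, None, False, None))

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    process = multiprocessing.Process(
        target=fake_benchmark_worker, args=('Foo.benchmark_bar', queue))
    process.start()
    process.join()  # memory used by the method is released when the child exits
    has_exception, execution_time, succeeded, output_dir = queue.get()
    print(has_exception, succeeded, output_dir)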
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/benchmark_test.py (new file, 0 → 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark.py."""
from __future__ import print_function

import sys
import unittest

import benchmark
import mock


class TestBenchmarkRunner(unittest.TestCase):

  def test_get_benchmark_methods_filter(self):
    """Tests returning methods on a class based on a filter."""
    config = mock.Mock()
    config.workspace = 'workspace'
    config.benchmark_method_patterns = ['new_foo.BenchmarkClass.filter:bench.*']
    benchmark_runner = benchmark.BenchmarkRunner(config)

    mock_benchmark_class = mock.Mock()
    mock_benchmark_class.benchmark_method_1 = 'foo'

    mock_module = mock.Mock()
    sys.modules['new_foo'] = mock_module
    mock_module.BenchmarkClass.return_value = mock_benchmark_class

    methods = benchmark_runner._get_benchmark_methods()

    self.assertEqual(1, len(methods))
    self.assertEqual('new_foo.BenchmarkClass.benchmark_method_1', methods[0])

  def test_get_benchmark_methods_exact_match(self):
    """Tests returning methods whose names are listed exactly."""
    config = mock.Mock()
    config.workspace = 'workspace'
    config.benchmark_method_patterns = [
        'new_foo.BenchmarkClass.benchmark_method_1',
        'new_foo.BenchmarkClass.benchmark_method_2'
    ]
    benchmark_runner = benchmark.BenchmarkRunner(config)

    methods = benchmark_runner._get_benchmark_methods()

    self.assertEqual([
        'new_foo.BenchmarkClass.benchmark_method_1',
        'new_foo.BenchmarkClass.benchmark_method_2'
    ], methods)
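These two tests exercise the two accepted --benchmark_methods formats: an exact module.Class.method path, and module.Class.filter:regex, which expands to every method of the class whose name matches the regular expression. A small illustration of the expansion, using a hypothetical benchmark class and simplifying away the dynamic class loading that _get_benchmark_methods actually performs:

import re

class Resnet50Benchmarks:
    # Hypothetical benchmark class used only for illustration.
    def benchmark_1_gpu_fp16(self): pass
    def benchmark_8_gpu_fp16(self): pass
    def helper_method(self): pass

pattern = 'bench.*'  # the part after 'filter:'
expanded = ['Resnet50Benchmarks.' + name
            for name in dir(Resnet50Benchmarks()) if re.match(pattern, name)]
print(expanded)
# ['Resnet50Benchmarks.benchmark_1_gpu_fp16', 'Resnet50Benchmarks.benchmark_8_gpu_fp16']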
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/cloud_manager.py (new file, 0 → 100644)
#!/usr/bin/python
#
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper script to create, query and stop machines in GCP."""
from __future__ import print_function

import argparse
import getpass
import logging
import subprocess
import sys
import time

INSTANCE_NAME_PREFIX = 'perfzero-dev-'


def run_command(cmd, is_from_user=False):
  """Runs the given command and raises an error if its return code is non-zero.

  Args:
    cmd: Command to execute
    is_from_user: If true, log the command and the command output in INFO
      level. Otherwise, log these in the DEBUG level.

  Returns:
    a string representing the command output

  Raises:
    Exception: raised when the command execution has non-zero exit code
  """
  _log = logging.info if is_from_user else logging.debug
  _log('Executing command: {}'.format(cmd))
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                       shell=True)

  exit_code = None
  line = ''
  stdout = ''
  while exit_code is None or line:
    exit_code = p.poll()
    line = p.stdout.readline().decode('utf-8')
    stdout += line
    _log(line)

  if exit_code and is_from_user:
    sys.exit(exit_code)
  elif exit_code:
    raise Exception('Command:\n{}\nfailed with output:\n{}'.format(cmd, stdout))

  return stdout


def get_instance_name(username):
  return INSTANCE_NAME_PREFIX + username


def get_machine_type(machine_type, accelerator_count):
  """Get machine type for the instance.

  - Use the user-specified machine_type if it is not None
  - Otherwise, use the standard type with cpu_count = 8 x accelerator_count
    if user-specified accelerator_count > 0
  - Otherwise, use the standard type with 8 cpu

  Args:
    machine_type: machine_type specified by the user
    accelerator_count: accelerator count

  Returns:
    the machine type used for the instance
  """
  if machine_type:
    return machine_type
  cpu_count = max(accelerator_count, 1) * 8
  return 'n1-standard-{}'.format(cpu_count)


def _ssh_prefix(project, zone, internal_ip, key_file):
  if internal_ip:
    ssh_prefix = 'gcloud beta compute ssh --internal-ip'
  else:
    ssh_prefix = 'gcloud compute ssh'
  if key_file:
    ssh_prefix = '{} --ssh-key-file={}'.format(ssh_prefix, key_file)
  return '{} --project={} --zone={}'.format(ssh_prefix, project, zone)


def create(username, project, zone, machine_type, accelerator_count,
           accelerator_type, image, nvme_count, ssh_internal_ip, ssh_key_file,
           cpu_min_platform=None, boot_ssd_size=None):
  """Create gcloud computing instance.

  Args:
    username: the username of the current user
    project: project name
    zone: zone of the GCP computing instance
    machine_type: the machine type used for the instance
    accelerator_count: the number of pieces of the accelerator to attach to
      the instance
    accelerator_type: the specific type of accelerator to attach to the
      instance
    image: the name of the image that the disk will be initialized with
    nvme_count: the number of NVME local SSD devices to attach to the instance
    ssh_internal_ip: internal ip to use for ssh.
    ssh_key_file: ssh key file to use to connect to instance.
    cpu_min_platform: minimum CPU platform to use, if None use default.
    boot_ssd_size: If set, boot disk is changed to SSD and this size (GB) is
      used.
  """
  instance_name = get_instance_name(username)
  machine_type = get_machine_type(machine_type, accelerator_count)
  logging.debug('Creating gcloud computing instance %s', instance_name)

  cmd = '''gcloud compute instances create {} \
--image={} \
--project={} \
--zone={} \
--machine-type={} \
--maintenance-policy=TERMINATE \
'''.format(instance_name, image, project, zone, machine_type)

  if boot_ssd_size:
    cmd += '--boot-disk-size={}GB --boot-disk-type=pd-ssd '.format(
        boot_ssd_size)

  if accelerator_count > 0:
    cmd += '--accelerator=count={},type={} '.format(accelerator_count,
                                                    accelerator_type)
  if cpu_min_platform:
    cmd += '--min-cpu-platform="{}" '.format(cpu_min_platform)

  for _ in range(nvme_count):
    cmd += '--local-ssd=interface=NVME '

  run_command(cmd, is_from_user=True)
  logging.info('Successfully created gcloud computing instance %s '
               'with %s accelerator.\n', instance_name, accelerator_count)

  ssh_prefix = _ssh_prefix(project, zone, ssh_internal_ip, ssh_key_file)

  # Wait until we can ssh to the newly created computing instance
  cmd = '{} --strict-host-key-checking=no --command="exit" {}'.format(
      ssh_prefix, instance_name)
  ssh_remaining_retries = 12
  ssh_error = None
  while ssh_remaining_retries > 0:
    ssh_remaining_retries -= 1
    try:
      run_command(cmd, is_from_user=False)
      ssh_error = None
    except Exception as error:  # pylint: disable=broad-except
      ssh_error = error
      if ssh_remaining_retries:
        logging.info('Cannot ssh to the computing instance. '
                     'Try again after 5 seconds')
        time.sleep(5)
      else:
        logging.error('Cannot ssh to the computing instance after '
                      '60 seconds due to error:\n%s', str(ssh_error))

  if ssh_error:
    logging.info('Run the commands below manually after ssh into the '
                 'computing instance:\n'
                 'git clone https://github.com/tensorflow/benchmarks.git\n'
                 'sudo usermod -a -G docker $USER\n')
  else:
    cmd = '{} --command="git clone {}" {}'.format(
        ssh_prefix, 'https://github.com/tensorflow/benchmarks.git',
        instance_name)
    run_command(cmd, is_from_user=True)
    logging.info('Successfully checked-out PerfZero code on the '
                 'computing instance\n')

    cmd = '{} --command="sudo usermod -a -G docker $USER" {}'.format(
        ssh_prefix, instance_name)
    run_command(cmd, is_from_user=True)
    logging.info('Successfully added user to the docker group\n')

  cmd = '{} {} -- -L 6006:127.0.0.1:6006'.format(ssh_prefix, instance_name)
  logging.info('Run the command below to ssh to the instance together with '
               'port forwarding for tensorboard:\n%s\n', cmd)


def status(username, project, zone, ssh_internal_ip, ssh_key_file):
  """Query the status of the computing instance.

  Args:
    username: the username of the current user.
    project: project name.
    zone: zone of the GCP computing instance.
    ssh_internal_ip: internal ip of the instance.
    ssh_key_file: SSH key file to use to connect to the instance.
  """
  instance_name = get_instance_name(username)
  logging.debug('Querying status of gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)

  cmd = 'gcloud compute instances list --filter="name={} AND zone:{}" --project {}'.format(  # pylint: disable=line-too-long
      instance_name, zone, project)
  stdout = run_command(cmd, is_from_user=True)
  num_instances = len(stdout.splitlines()) - 1
  logging.info('\nFound %s gcloud computing instance with name %s.\n',
               num_instances, instance_name)

  if num_instances == 1:
    cmd = '{} {} -- -L 6006:127.0.0.1:6006'.format(
        _ssh_prefix(project, zone, ssh_internal_ip, ssh_key_file),
        instance_name)
    logging.info('Run the command below to ssh to the instance together with '
                 'port forwarding for tensorboard:\n%s\n', cmd)


def list_all(project):
  logging.debug('Finding all gcloud computing instance of project %s created '
                'for PerfZero test', project)
  cmd = 'gcloud compute instances list --filter="name ~ {}" --project={}'.format(  # pylint: disable=line-too-long
      INSTANCE_NAME_PREFIX, project)
  stdout = run_command(cmd, is_from_user=True)
  num_instances = len(stdout.splitlines()) - 1
  logging.info('\nFound %s gcloud computing instance of project %s created '
               'for PerfZero test', num_instances, project)


def start(username, project, zone):
  instance_name = get_instance_name(username)
  logging.debug('Starting gcloud computing instance %s of project %s '
                'in zone %s', instance_name, project, zone)

  cmd = 'gcloud compute instances start {} --project={} --zone={}'.format(
      instance_name, project, zone)
  run_command(cmd, is_from_user=True)
  logging.debug('\nSuccessfully started gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)


def stop(username, project, zone):
  instance_name = get_instance_name(username)
  logging.debug('Stopping gcloud computing instance %s of project %s in '
                'zone %s', instance_name, project, zone)

  cmd = 'gcloud compute instances stop {} --project={} --zone={}'.format(
      instance_name, project, zone)
  run_command(cmd, is_from_user=True)
  logging.debug('\nSuccessfully stopped gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)


def delete(username, project, zone):
  instance_name = get_instance_name(username)
  logging.debug('Deleting gcloud computing instance %s of project %s in '
                'zone %s', instance_name, project, zone)

  cmd = 'echo Y | gcloud compute instances delete {} --project={} --zone={}'.format(  # pylint: disable=line-too-long
      instance_name, project, zone)
  run_command(cmd, is_from_user=True)
  logging.debug('\nSuccessfully deleted gcloud computing instance %s of '
                'project %s in zone %s', instance_name, project, zone)


def parse_arguments(argv, command):  # pylint: disable=redefined-outer-name
  """Parse command line arguments and return parsed flags.

  Args:
    argv: command line arguments
    command: the subcommand requested by the user

  Returns:
    parsed flags
  """
  # pylint: disable=redefined-outer-name
  parser = argparse.ArgumentParser(
      usage='cloud_manager.py {} [<args>]'.format(command),
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument(
      '--debug',
      action='store_true',
      help='If set, use debug level logging. Otherwise, use info level logging')
  parser.add_argument(
      '--project',
      default='google.com:tensorflow-performance',
      type=str,
      help='Google Cloud Platform project name to use for this invocation')

  if command in ['create', 'start', 'stop', 'delete', 'status']:
    parser.add_argument(
        '--username',
        default=getpass.getuser(),
        type=str,
        help='''Username that uniquely identifies the name of computing instance created for PerfZero.
        The default value is your ldap username.
        ''')
    parser.add_argument(
        '--zone',
        default='us-west1-b',
        type=str,
        help='Zone of the instance to create.')
    parser.add_argument(
        '--ssh-internal-ip',
        action='store_true',
        help='If set, use internal IP for ssh with `gcloud beta compute ssh`.')
    parser.add_argument(
        '--ssh-key-file',
        default=None,
        type=str,
        help='The ssh key to use with `gcloud (beta) compute ssh`.')

  if command == 'create':
    parser.add_argument(
        '--accelerator_count',
        default=1,
        type=int,
        help='The number of pieces of the accelerator to attach to the instance')
    parser.add_argument(
        '--accelerator_type',
        default='nvidia-tesla-v100',
        type=str,
        help='''The specific type (e.g. nvidia-tesla-v100 for nVidia Tesla V100) of
        accelerator to attach to the instance. Use 'gcloud compute accelerator-types list --project=${project_name}' to
        learn about all available accelerator types.
        ''')
    parser.add_argument(
        '--cpu_min_platform',
        default=None,
        type=str,
        help='Minimum cpu platform, only needed for CPU only instances.')
    parser.add_argument(
        '--machine_type',
        default=None,
        type=str,
        help='''The machine type used for the instance. To get a list of available machine
        types, run 'gcloud compute machine-types list --project=${project_name}'
        ''')
    parser.add_argument(
        '--image',
        default='tf-ubuntu-1604-20180927-410',
        type=str,
        help='''Specifies the name of the image that the disk will be initialized with.
        A new disk will be created based on the given image. To view a list of
        public images and projects, run 'gcloud compute images list --project=${project_name}'. It is best
        practice to use image when a specific version of an image is needed.
        ''')
    parser.add_argument(
        '--nvme_count',
        default=0,
        type=int,
        help='Specifies the number of NVME local SSD devices to attach to the instance.')
    parser.add_argument(
        '--boot_ssd_size',
        default=None,
        type=int,
        help='''Specifies the size (GB) of the boot disk; otherwise the image size
        is used. Setting this also changes the boot disk to Persistent SSD.
        ''')

  flags, unparsed = parser.parse_known_args(argv)  # pylint: disable=redefined-outer-name
  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  level = logging.DEBUG if flags.debug else logging.INFO
  logging.basicConfig(format='%(message)s', level=level)

  return flags


if __name__ == '__main__':
  parser = argparse.ArgumentParser(
      usage='''cloud_manager.py <command> [<args>]

The supported commands are:
  create:   Create a computing instance in gcloud that is unique to the specified username, which is your ldap by default
  start:    Start the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  stop:     Stop the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  delete:   Delete the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  status:   Query the status and information of the computing instance in gcloud that is unique to the specified username, which is your ldap by default
  list_all: Query the status of all computing instances that are created by this script.''')
  parser.add_argument('command', type=str)

  flags = parser.parse_args(sys.argv[1:2])
  command = flags.command
  if not hasattr(sys.modules[__name__], command):
    print('Error: The command <{}> is not recognized\n'.format(command))
    parser.print_help()
    sys.exit(1)

  flags = parse_arguments(sys.argv[2:], command)

  if command == 'create':
    create(flags.username, flags.project, flags.zone, flags.machine_type,
           flags.accelerator_count, flags.accelerator_type, flags.image,
           flags.nvme_count, flags.ssh_internal_ip, flags.ssh_key_file,
           cpu_min_platform=flags.cpu_min_platform,
           boot_ssd_size=flags.boot_ssd_size)
  elif command == 'start':
    start(flags.username, flags.project, flags.zone)
  elif command == 'stop':
    stop(flags.username, flags.project, flags.zone)
  elif command == 'delete':
    delete(flags.username, flags.project, flags.zone)
  elif command == 'status':
    status(flags.username, flags.project, flags.zone, flags.ssh_internal_ip,
           flags.ssh_key_file)
  elif command == 'list_all':
    list_all(flags.project)
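One default worth calling out: when --machine_type is omitted, create() sizes the VM through get_machine_type(), which allocates 8 vCPUs per requested accelerator with a floor of one. A quick worked example of that rule:

def get_machine_type(machine_type, accelerator_count):
    # Same sizing rule as cloud_manager.get_machine_type().
    if machine_type:
        return machine_type
    cpu_count = max(accelerator_count, 1) * 8
    return 'n1-standard-{}'.format(cpu_count)

print(get_machine_type(None, 0))              # n1-standard-8  (CPU-only default)
print(get_machine_type(None, 4))              # n1-standard-32 (8 vCPUs per accelerator)
print(get_machine_type('n1-standard-16', 4))  # an explicit --machine_type always wins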
TensorFlow2x/ComputeVision/Classification/tf_cnn_benchmarks/models/experimental/__init__.py → TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/__init__.py (file moved)
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/benchmark_method_runner.py (new file, 0 → 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Execute a single benchmark method."""
from __future__ import print_function

import datetime
import json
import logging
import os
import time
import traceback

from perfzero.process_info_tracker import ProcessInfoTracker
import perfzero.report_utils as report_utils
from perfzero.tensorflow_profiler import TensorFlowProfiler
import perfzero.utils as utils


def run(benchmark_method, harness_info, site_package_info, root_output_dir,
        config, queue, trial_id):
  try:
    _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue, trial_id)
  except Exception:  # pylint: disable=broad-except
    logging.error('Benchmark execution for %s failed due to error:\n%s',
                  benchmark_method, traceback.format_exc())
    queue.put((True, None, False, None))


def _set_file_contents(content_str, output_filename):
  with open(output_filename, 'w') as f:
    f.write(content_str)
  logging.info('Wrote summary to file %s', output_filename)


def _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue, trial_id):
  """Run benchmark method and put result to the queue.

  Args:
    benchmark_method: Canonical path to the benchmark method
    harness_info: Description of the benchmark harness used in the benchmark
    site_package_info: Description of the site-package used in the benchmark
    root_output_dir: Directory under which to put the benchmark output
    config: An instance of perfzero_config
    queue: An interprocess queue to transfer benchmark result to the caller.
    trial_id: An integer trial id to annotate in the benchmark result.
  """
  start_timestamp = time.time()
  execution_timestamp = start_timestamp
  method_has_exception = False
  execution_id = (config.execution_id if config.execution_id else
                  datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
  output_dir = os.path.join(root_output_dir, execution_id)
  if config.scratch_gcs_url:
    model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
  else:
    model_output_dir = output_dir
  utils.make_dir_if_not_exist(output_dir)
  benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
  benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

  tensorflow_profiler = TensorFlowProfiler(
      config.profiler_enabled_time_str, output_dir)
  process_info_tracker = ProcessInfoTracker(output_dir)
  process_info = None

  # Setup per-method file logger
  filehandler = logging.FileHandler(
      filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
  filehandler.setFormatter(
      logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
  logging.getLogger().addHandler(filehandler)

  try:
    if config.tpu_parameters:
      tpu = config.tpu_parameters.get('name')
    else:
      tpu = None
    if config.perfzero_constructor_args:
      constructor_args = json.loads(config.perfzero_constructor_args)
    else:
      constructor_args = {}
    class_instance = utils.instantiate_benchmark_class(
        benchmark_class=benchmark_class,
        output_dir=model_output_dir,
        root_data_dir=config.root_data_dir,
        tpu=tpu,
        constructor_args=constructor_args,
        benchmark_class_type=config.benchmark_class_type)
    # tf.test.Benchmark.report_benchmark() writes results to a file with
    # path benchmark_result_file_path_prefix + benchmark_method
    benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
    os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
    benchmark_result_file_path = '{}{}.{}'.format(
        benchmark_result_file_path_prefix, benchmark_class_name,
        benchmark_method_name)

    # Start background threads for profiler and system info tracker
    tensorflow_profiler.start()
    process_info_tracker.start()

    # Run benchmark method
    execution_timestamp = time.time()
    logging.info('Starting benchmark execution: %s', benchmark_method)
    getattr(class_instance, benchmark_method_name)()
    logging.info('Stopped benchmark: %s', benchmark_method)

    # Read and build benchmark results
    raw_benchmark_result = utils.read_benchmark_result(
        benchmark_result_file_path)
    # Explicitly overwrite the name to be the full path to benchmark method
    raw_benchmark_result['name'] = benchmark_method
  except Exception:  # pylint: disable=broad-except
    logging.error('Benchmark execution for %s failed due to error:\n%s',
                  benchmark_method, traceback.format_exc())
    method_has_exception = True
    raw_benchmark_result = {}
    raw_benchmark_result['name'] = benchmark_method
    raw_benchmark_result['wall_time'] = -1
    raw_benchmark_result['extras'] = {}
  finally:
    # Stop background threads for profiler and system info tracker
    process_info = process_info_tracker.stop()
    tensorflow_profiler.stop()

  upload_timestamp = time.time()
  benchmark_result = report_utils.build_benchmark_result(
      raw_benchmark_result, method_has_exception, trial_id)
  execution_summary = report_utils.build_execution_summary(
      execution_timestamp, execution_id, config.ml_framework_build_label,
      config.execution_label, config.platform_name, config.system_name,
      config.output_gcs_url, benchmark_result, config.get_env_vars(),
      config.get_flags(), harness_info, site_package_info, process_info,
      method_has_exception,
      is_tpu_benchmark=(config.tpu_parameters != None))
  report_utils.upload_execution_summary(
      config.bigquery_project_name, config.bigquery_dataset_table_name,
      execution_summary)
  report_utils.execute_methods(
      config.result_upload_methods, execution_summary)
  logging.info('Benchmark execution for %s completed with summary:\n%s',
               benchmark_method, json.dumps(execution_summary, indent=2))
  _set_file_contents(json.dumps(execution_summary, indent=2),
                     os.path.join(output_dir, 'perfzero_summary.json'))
  utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
  logging.getLogger().removeHandler(filehandler)
  method_execution_time = {
      'class_initialization': execution_timestamp - start_timestamp,
      'method_execution': upload_timestamp - execution_timestamp,
      'log_upload': time.time() - upload_timestamp
  }

  if config.profiler_enabled_time_str:
    relative_output_dir = output_dir[output_dir.find('benchmark'):]
    print('\nExecute the command below to start tensorboard server using '
          'the collected profiler data:\ntensorboard --logdir={}\n\n'
          'Open localhost:6006 in your browser to access the Tensorboard '
          'GUI. Use ssh with port forwarding if tensorboard is running on '
          'a remote machine.\n'.format(relative_output_dir))

  queue.put((method_has_exception, method_execution_time,
             benchmark_result['succeeded'], output_dir))
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/device_utils.py (new file, 0 → 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Setup the data drive with raid, RAM, or mount network drives."""
from __future__ import print_function

import logging
import perfzero.utils as utils


def create_drive_from_devices(data_dir, gce_nvme_raid):
  """Creates a drive at data directory."""
  if not gce_nvme_raid:
    return
  devices = _get_nvme_devices()
  cmd = 'mountpoint -q {}'.format(data_dir)
  retcode, _ = utils.run_command(cmd)
  if retcode:
    if len(devices) > 1:
      _create_drive_raid(data_dir, devices)
    else:
      _create_single_drive(data_dir, devices[0])


def _get_nvme_devices():
  """Returns a list of paths to nvme devices."""
  devices = []
  cmd = 'lsblk'
  retcode, log = utils.run_command(cmd)
  if retcode:
    raise Exception('"{}" failed with code:{} and log:\n{}'.format(
        cmd, retcode, log))
  lines = log.splitlines()
  if lines:
    for line in lines:
      if line.startswith('nvme'):
        parts = line.split()
        devices.append('/dev/' + parts[0].strip())
  return devices


def _create_single_drive(data_dir, device):
  """Creates a data drive out of a single device."""
  cmds = []
  cmds.append('mkfs.ext4 -F {}'.format(device))
  cmds.append('mkdir -p {}'.format(data_dir))
  cmds.append('mount {} {}'.format(device, data_dir))
  cmds.append('chmod a+w {}'.format(data_dir))
  utils.run_commands(cmds)
  logging.info('Created and mounted device %s at %s', device, data_dir)


def _create_drive_raid(data_dir, devices):
  """Creates a raid zero array of nvme drives."""
  cmds = []
  # Passing 'yes' because GCE nvme drives are sometimes in an odd state and
  # think they are in another raid. mdadm does not have -y option.
  # Or the kokoro images were left dirty? and that is where the info
  # comes from.
  cmds.append('yes | mdadm --create /dev/md0 --level=0 '
              '--raid-devices={} {}'.format(len(devices), ' '.join(devices)))
  cmds.append('mkfs.ext4 -F /dev/md0')
  cmds.append('mkdir -p {}'.format(data_dir))
  cmds.append('mount /dev/md0 {}'.format(data_dir))
  cmds.append('chmod a+w {}'.format(data_dir))
  utils.run_commands(cmds)
  logging.info('Created and mounted RAID array at %s', data_dir)
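_get_nvme_devices() simply scans lsblk output for rows whose device name starts with nvme (this commit also adds test_files/nvme_device_log.txt as sample input for the tests). A self-contained sketch of that parsing step; the lsblk output below is made up for illustration:

def parse_nvme_devices(lsblk_output):
    # Mirrors _get_nvme_devices(): keep rows whose first column starts with 'nvme'.
    devices = []
    for line in lsblk_output.splitlines():
        if line.startswith('nvme'):
            devices.append('/dev/' + line.split()[0].strip())
    return devices

# Made-up lsblk output, for illustration only.
sample = (
    'NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT\n'
    'sda       8:0    0  100G  0 disk /\n'
    'nvme0n1 259:0    0  375G  0 disk\n'
    'nvme1n1 259:1    0  375G  0 disk\n'
)
print(parse_nvme_devices(sample))  # ['/dev/nvme0n1', '/dev/nvme1n1']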
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config.py (new file, 0 → 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero configs provided by user."""
from
__future__
import
print_function
import
json
import
logging
import
os
def
add_setup_parser_arguments
(
parser
):
"""Add arguments to the parser used by the setup.py."""
parser
.
add_argument
(
'--dockerfile_path'
,
default
=
'docker/Dockerfile_ubuntu_1804_tf_v1'
,
type
=
str
,
help
=
'''Build the docker image using docker file located at the ${pwd}/${dockerfile_path} if
it exists, where ${pwd} is user's current work directory. Otherwise, build
the docker image using the docker file located at path_to_perfzero/${dockerfile_path}.
'''
)
parser
.
add_argument
(
'--workspace'
,
default
=
'workspace'
,
type
=
str
,
help
=
'''The gcloud key file will be downloaded under directory path_to_perfzero/${workspace}
'''
)
parser
.
add_argument
(
'--gcloud_key_file_url'
,
default
=
''
,
type
=
str
,
help
=
'''DEPRECATED: Use --gcloud_key_file_url of setup.py instead.
The gcloud key file url. When specified, it will be downloaded to the
directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
'''
)
parser
.
add_argument
(
'--root_data_dir'
,
default
=
'/data'
,
type
=
str
,
help
=
'The directory which should contain the dataset required by the becnhmark method.'
)
parser
.
add_argument
(
'--gce_nvme_raid'
,
default
=
None
,
type
=
str
,
help
=
'If set to non-empty string, create raid 0 array with devices at the directory specified by the flag --root_data_dir'
)
parser
.
add_argument
(
'--tensorflow_pip_spec'
,
default
=
None
,
type
=
str
,
help
=
'''The tensorflow pip package specfication. The format can be either ${package_name}, or ${package_name}==${package_version}.
Example values include tf-nightly-gpu, and tensorflow==1.12.0. If it is specified, the corresponding tensorflow pip package/version
will be installed. Otherwise, the default tensorflow pip package specified in the docker file will be installed.
'''
)
parser
.
add_argument
(
'--extra_pip_specs'
,
default
=
''
,
type
=
str
,
help
=
'''Additional specifications to pass to `pip install`. (e.g. pinning certain dependencies)
Specifications should be semicolon separated: e.g. `numpy==1.16.4;scipy==1.3.1`
'''
)
parser
.
add_argument
(
'--docker_tag'
,
default
=
'perfzero/tensorflow'
,
type
=
str
,
help
=
'The docker tag to use if building a docker image.'
)
parser
.
add_argument
(
'--site_package_downloads'
,
default
=
''
,
type
=
str
,
help
=
'''Comma separated list of dirs in the external vm to copy to the docker
\'
s site package dir.
Format: <absolute-path>/src/dir:new_base_dir_name,<absolute-path>/src/dir2>:new_name,....
This will copy <absolute-path>/src/dir to <site-packages>/new_base_dir_name.
'''
)
parser
.
add_argument
(
'--extra_docker_build_args'
,
nargs
=
'*'
,
default
=
''
,
type
=
str
,
help
=
'''Additional build-args to pass to `docker build`.
Example: --extra_docker_build_args arg0 arg1=value1 "arg2=value with space" arg3=300.
Each string will be passed directly as a build-arg to docker, so the above example will be passed as follows:
--build-arg arg0 --build-arg arg1=value1 --build-arg "arg2=value with space" --build-arg arg3=300
'''
)
def
add_benchmark_parser_arguments
(
parser
):
"""Add arguments to the parser used by the benchmark.py."""
parser
.
add_argument
(
'--use_cached_site_packages'
,
action
=
'store_true'
,
help
=
'If set, skip git pull for dependent git repositories if it already exists in path_to_perfzero/${workspace}/site-packages'
)
parser
.
add_argument
(
'--gcs_downloads'
,
default
=
None
,
type
=
str
,
help
=
'This flag is deprecated. Use the flag --data_downloads instead'
)
parser
.
add_argument
(
'--git_repos'
,
default
=
None
,
type
=
str
,
help
=
'''A string representing git repositories to checkout. The format is url_1;branch_1;hash_1,url_2;branch_2;hash_2,...
Git repositories will be checked-out under directory path_to_perfzero/${workspace}/site-packages,
where ${workspace} either defaults to 'workspace', or takes the value of the flag --workspace.
branch and hash can be skipped if user wants to use the head of the master branch,
which shortens the format to url_1,url_2,...
'''
)
parser
.
add_argument
(
'--benchmark_num_trials'
,
default
=
1
,
type
=
int
,
help
=
'''Configures number of times to run each benchmark method
after setup completion.'''
)
parser
.
add_argument
(
'--benchmark_methods'
,
action
=
'append'
,
default
=
[],
type
=
str
,
help
=
'''This string specifies the benchmark_method to be executed. The flag can be specified multiple times in which case
the union of methods matched by these flags will be executed. The format can be module_path.class_name.method_name in which
case the corresponding method is executed. The format can also be module_path.class_name.filter:regex_pattern, in which case all methods
of the given class whose method name matches the given regular expression are executed.
'''
)
parser
.
add_argument
(
'--ml_framework_build_label'
,
default
=
None
,
type
=
str
,
help
=
'A string that identified the machine learning framework build, e.g. nightly-gpu-build'
)
parser
.
add_argument
(
'--execution_label'
,
default
=
None
,
type
=
str
,
help
=
'A string that identified the benchmark execution type, e.g. test, prod'
)
parser
.
add_argument
(
'--platform_name'
,
default
=
None
,
type
=
str
,
help
=
'A string that identified the computing platform, e.g. gcp, aws'
)
parser
.
add_argument
(
'--system_name'
,
default
=
None
,
type
=
str
,
help
=
'A string that identified the hardware system, e.g. n1-standard-64-8xV100'
)
parser
.
add_argument
(
'--output_gcs_url'
,
default
=
None
,
type
=
str
,
help
=
'''If specified, log files generated by the benchmark execution will be uploaded to output_gcs_url/${execution_id},
where ${execution_id} is a string that generated by PerfZero which uniquely identifies the execution of one benchmark method
'''
)
parser
.
add_argument
(
'--scratch_gcs_url'
,
default
=
None
,
type
=
str
,
help
=
'''If specified, intermediate files like model outputs will be stored in scratch_gcs_url/${execution_id}, where
${execution_id} is a string that is generated by PerfZero which uniquely identifies the execution of one benchmark method.
If not specified, intermediate files will be stored in a local folder on the host.
'''
)
parser
.
add_argument
(
'--bigquery_project_name'
,
default
=
None
,
type
=
str
,
help
=
'''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
The value of each field can in turn be a json-formatted string. See README.md for example output.
'''
)
parser
.
add_argument
(
'--bigquery_dataset_table_name'
,
default
=
None
,
type
=
str
,
help
=
'''If both --bigquery_project_name and --bigquery_dataset_table_name are specified, for each benchmark method, the benchmark
      summary will be uploaded to the specified bigquery table whose schema is defined in perfzero/scripts/create_big_table.txt.
      The value of each field can in turn be a json-formatted string. See README.md for example output.
      ''')
  parser.add_argument(
      '--python_path',
      default=None,
      type=str,
      help='''A string of format path_1,path_2,... For each ${path} specified in the string,
      path_to_perfzero/${workspace}/site-packages/${path} will be added to the Python path so that libraries downloaded by --git_repos can
      be loaded and executed.
      ''')
  parser.add_argument(
      '--workspace',
      default='workspace',
      type=str,
      help='''The log files, gcloud key file and git repositories will be generated and downloaded under the
      directory path_to_perfzero/${workspace}
      ''')
  parser.add_argument(
      '--root_data_dir',
      default='/data',
      type=str,
      help='The directory which should contain the dataset required by the benchmark method.')
  parser.add_argument(
      '--data_downloads',
      default=None,
      type=str,
      help='''A string of format url_1;relative_path_1,url_2;relative_path_2,...
      Data will be copied from ${url} to ${root_data_dir}/${relative_path}. ${relative_path} can be skipped if it is the same as the
      base name of the url, which shortens the format to url_1,url_2,... ${root_data_dir} is specified by the flag --root_data_dir.
      Files will be decompressed in ${root_data_dir} if their names end with 'gz'.
      Each url must start with file://, gcs://, http:// or https://.
      ''')
  parser.add_argument(
      '--gcloud_key_file_url',
      default='gs://tf-performance/auth_tokens/benchmark_upload_gce.json',
      type=str,
      help='''The gcloud key file url. When specified, it will be downloaded to the
      directory specified by the flag --workspace. Each url can start with file://, gcs://, http:// or https://.
      The key file will then be activated and used as the gcloud authentication credential.
      ''')
  parser.add_argument(
      '--debug',
      action='store_true',
      help='If set, use debug level logging. Otherwise, use info level logging.')
  parser.add_argument(
      '--profiler_enabled_time',
      default=None,
      type=str,
      help='''A string of format begin_time_1:end_time_1,begin_time_2:end_time_2,... PerfZero will start to collect profiler
      data ${begin_time} sec after benchmark method execution starts. The data collection continues for ${end_time - begin_time}
      sec or until the benchmark method execution finishes, whichever occurs first. If ${end_time} is not explicitly
      specified, it is assumed to be MAX_LONG.
      ''')
  parser.add_argument(
      '--execution_id',
      default=None,
      type=str,
      help='A string that uniquely identifies the benchmark execution.')
  parser.add_argument(
      '--result_upload_methods',
      default=None,
      type=str,
      help='A comma-separated list of class.method values to upload results.')
  parser.add_argument(
      '--tpu_parameters',
      default=None,
      type=str,
      help='''A json dictionary of cloud tpu parameters. The format must look like the following:
      {"name": "my-tpu-name", "project": "my-gcp-project-id", "zone": "europe-west4-a", "size": "v3-8", "version": "nightly-2.x"}
      It can have an optional key value pair "version_id" -> "nightly version" to change the tpu version id.
      Example: "version_id": "2.4.0-dev20200728".
      ''')
  parser.add_argument(
      '--perfzero_constructor_args',
      nargs='*',
      default='',
      type=str,
      help='''A json dictionary of additional args to pass to the perfzero
      constructor.''')
  parser.add_argument(
      '--benchmark_class_type',
      default=None,
      type=str,
      help='''The benchmark class type. If None, perfzero_benchmark is assumed. Set to "tf_benchmark"
      for tf.test.Benchmark benchmarks.''')
class PerfZeroConfig(object):
  """Creates and contains config for PerfZero."""

  def __init__(self, mode, flags=None):
    self.mode = mode
    self.flags = flags
    if mode == 'flags':
      self.gcs_downloads_str = flags.gcs_downloads
      self.data_downloads_str = flags.data_downloads
      self.git_repos_str = flags.git_repos
      self.benchmark_method_patterns = []
      for value in flags.benchmark_methods:
        self.benchmark_method_patterns.extend(value.split(','))
      self.ml_framework_build_label = flags.ml_framework_build_label
      self.execution_label = flags.execution_label
      self.platform_name = flags.platform_name
      self.system_name = flags.system_name
      self.output_gcs_url = flags.output_gcs_url
      self.scratch_gcs_url = flags.scratch_gcs_url
      self.bigquery_project_name = flags.bigquery_project_name
      self.bigquery_dataset_table_name = flags.bigquery_dataset_table_name
      self.python_path_str = flags.python_path
      self.workspace = flags.workspace
      self.benchmark_class_type = flags.benchmark_class_type
      self.use_cached_site_packages = flags.use_cached_site_packages
      self.root_data_dir = flags.root_data_dir
      self.gcloud_key_file_url = flags.gcloud_key_file_url
      self.profiler_enabled_time_str = flags.profiler_enabled_time
      self.execution_id = flags.execution_id
      self.result_upload_methods = flags.result_upload_methods
      self.perfzero_constructor_args = flags.perfzero_constructor_args
      self.benchmark_num_trials = flags.benchmark_num_trials

      if flags.tpu_parameters:
        self.tpu_parameters = json.loads(flags.tpu_parameters)
      else:
        self.tpu_parameters = None

      if not flags.benchmark_methods:
        logging.warning('No benchmark method is specified by '
                        '--benchmark_methods')
      if flags.bigquery_project_name and not flags.bigquery_dataset_table_name:
        raise ValueError('--bigquery_project_name is specified but '
                         '--bigquery_dataset_table_name is not')
      if not flags.bigquery_project_name and flags.bigquery_dataset_table_name:
        raise ValueError('--bigquery_dataset_table_name is specified but '
                         '--bigquery_project_name is not')

  def get_env_vars(self):
    env_vars = {}
    for key in os.environ.keys():
      if key.startswith('PERFZERO_'):
        env_vars[key] = os.environ[key]
    return env_vars

  def get_flags(self):
    not_none_flags = {}
    for key in vars(self.flags):
      value = getattr(self.flags, key)
      if value is not None:
        not_none_flags[key] = value
    return not_none_flags

  def get_git_repos(self, site_packages_dir):
    """Parse git repository string."""
    git_repos = []
    if not self.git_repos_str:
      return git_repos

    for repo_entry in self.git_repos_str.split(','):
      parts = repo_entry.split(';')
      git_repo = {}
      git_repo['url'] = parts[0]
      # Assume the git url has format */{dir_name}.git
      git_repo['dir_name'] = parts[0].rsplit('/', 1)[-1].rsplit('.', 1)[0]
      git_repo['local_path'] = os.path.join(site_packages_dir,
                                            git_repo['dir_name'])
      if len(parts) >= 2:
        git_repo['branch'] = parts[1]
      if len(parts) >= 3:
        git_repo['git_hash'] = parts[2]
      git_repos.append(git_repo)

    return git_repos
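For reference, a minimal sketch (not part of the commit) of how the semicolon/comma-separated --git_repos value documented above is interpreted by get_git_repos; the mock mode avoids needing the full flag set, and the repository URL and paths here are only illustrative.

```python
import perfzero.perfzero_config as perfzero_config

config = perfzero_config.PerfZeroConfig(mode='mock')
config.git_repos_str = 'https://github.com/tensorflow/models.git;master;abc123'
for repo in config.get_git_repos('/workspace/site-packages'):
  # Each entry becomes a dict with url, dir_name, local_path and optional branch/hash.
  print(repo['dir_name'], repo.get('branch'), repo.get('git_hash'))
# -> models master abc123
```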
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/perfzero_config_test.py
0 → 100644
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for perfzero_config.py."""
from __future__ import print_function

import os
import unittest

import perfzero.perfzero_config as perfzero_config


class TestPerfZeroConfig(unittest.TestCase):

  def test_get_git_repos(self):
    config = perfzero_config.PerfZeroConfig(mode='mock')
    config.git_repos_str = 'https://github.com/tensorflow/benchmarks.git;branch_1;hash_1,https://github.com/tensorflow/models.git;branch_2'
    git_repos = config.get_git_repos('/site_package_dir')

    git_repo_1 = {}
    git_repo_1['url'] = 'https://github.com/tensorflow/benchmarks.git'
    git_repo_1['dir_name'] = 'benchmarks'
    git_repo_1['local_path'] = '/site_package_dir/benchmarks'
    git_repo_1['branch'] = 'branch_1'
    git_repo_1['git_hash'] = 'hash_1'

    git_repo_2 = {}
    git_repo_2['url'] = 'https://github.com/tensorflow/models.git'
    git_repo_2['dir_name'] = 'models'
    git_repo_2['local_path'] = '/site_package_dir/models'
    git_repo_2['branch'] = 'branch_2'

    self.assertEqual(2, len(git_repos))
    self.assertEqual(git_repo_1, git_repos[0])
    self.assertEqual(git_repo_2, git_repos[1])

  def test_get_env_vars(self):
    config = perfzero_config.PerfZeroConfig(mode='mock')
    self.assertEqual({}, config.get_env_vars())

    os.environ['PERFZERO_VAR1'] = 'value1'
    self.assertEqual({'PERFZERO_VAR1': 'value1'}, config.get_env_vars())
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/process_info_tracker.py
0 → 100644
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keep track of process information such as maximum memory usage with a separate thread."""
from __future__ import absolute_import

import json
import logging
import os
import sched
import threading
import time
import traceback

import psutil


class ProcessInfoTracker(object):
  """Keep track of process information such as maximum memory usage with separate thread."""

  def __init__(self, output_dir):
    self.process_info_log = open(os.path.join(output_dir, 'process_info.log'), 'w')
    self.scheduler = sched.scheduler(time.time, time.sleep)
    self.process_info = {}
    self.process_info['max_rss'] = 0
    self.process_info['max_vms'] = 0
    self.process_info['max_cpu_percent'] = 0
    self.exit_event = threading.Event()
    self.last_exception = None
    self.start_time = None

  def start(self):
    self.start_time = time.time()
    # 4th positional arg added to support Python2 for the short-term.
    self.scheduler.enter(1, 1, self._update_process_info, ())  # pylint: disable=no-value-for-parameter
    threading.Thread(target=self.scheduler.run).start()
    logging.info('Started process information tracker.')

  def stop(self):
    self.exit_event.set()
    self.process_info_log.flush()
    logging.info('Stopped process information tracker.')

    if self.last_exception is not None:
      raise self.last_exception  # pylint: disable=raising-bad-type

    return dict(self.process_info)

  def _update_process_info(self):
    """Read and update process info using background thread every 1 second."""
    try:
      p = psutil.Process(os.getpid())
      memory_info = p.memory_info()
      # This is a blocking call which takes 0.1 second.
      # This affects the interval at which the metrics are reported.
      cpu_percent = p.cpu_percent(interval=0.1)
      self.process_info['max_rss'] = max(self.process_info['max_rss'],
                                         memory_info.rss)
      self.process_info['max_vms'] = max(self.process_info['max_vms'],
                                         memory_info.vms)
      self.process_info['max_cpu_percent'] = max(
          self.process_info['max_cpu_percent'], cpu_percent)

      entry = {}
      entry['time'] = time.time() - self.start_time
      entry['rss'] = memory_info.rss
      entry['vms'] = memory_info.vms
      entry['cpu_percent'] = cpu_percent
      self.process_info_log.write(json.dumps(entry) + '\n')

      if not self.exit_event.is_set():
        # Schedule the next event to be run after 1 second.
        # 4th positional arg added to support Python2 for the short-term.
        self.scheduler.enter(1, 1, self._update_process_info, ())  # pylint: disable=no-value-for-parameter
    except Exception as e:  # pylint: disable=broad-except
      logging.error('Process info tracker failed due to error:\n%s',
                    traceback.format_exc())
      self.last_exception = e
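A usage sketch (not part of the commit) of how the tracker brackets a benchmark run; the temporary output directory and the sleep standing in for the benchmark method are assumptions, and perfzero/lib is assumed to be on PYTHONPATH.

```python
import tempfile
import time

from perfzero.process_info_tracker import ProcessInfoTracker

output_dir = tempfile.mkdtemp()
tracker = ProcessInfoTracker(output_dir)
tracker.start()          # background thread samples rss/vms/cpu once per second
time.sleep(3)            # stand-in for the benchmark method execution
process_info = tracker.stop()
# stop() returns the peak values recorded while the tracker was running.
print(process_info['max_rss'], process_info['max_vms'], process_info['max_cpu_percent'])
```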
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/report_utils.py
0 → 100644
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Upload test results."""
from __future__ import print_function

import importlib
import json
import logging

import perfzero.utils as utils
import psutil
import socket
from six import u as unicode  # pylint: disable=W0622


def execute_methods(method_names_str, *args, **kwargs):
  """Calls a list of method names on given function params.

  Args:
    method_names_str: String - Comma-separated module.foo.bar.method strings.
      This function imports module.foo.bar for each such method and calls it
      with *args and **kwargs.
    *args: Function params common to each method.
    **kwargs: Function params common to each method.

  Raises:
    RuntimeError: If any of the invoked methods raised an exception.
  """
  if not method_names_str:
    return

  errors = []
  module_methods_list = method_names_str.split(',')
  for module_method in module_methods_list:
    try:
      logging.info('Trying to call %s', module_method)
      module_path, method_path = module_method.rsplit('.', 1)
      this_module = importlib.import_module(module_path)
      logging.info('Found module %s, looking for %s', module_path, method_path)
      this_method = getattr(this_module, method_path)
      logging.info('Found method %s', method_path)
      this_method(*args, **kwargs)
    except Exception as e:  # pylint: disable=broad-except
      errors.append(str(e))

  if errors:
    raise RuntimeError('\n' + '\n'.join(errors))


def upload_execution_summary(bigquery_project_name,
                             bigquery_dataset_table_name,
                             execution_summary):
  """Upload benchmark summary.

  Note: Using stream=False has a 1000 per day insert limit per table. Using
  stream=True, the documented limit is 50K+. With streaming there can be
  a small and possibly not noticeable delay in seeing the results in the
  BigQuery UI, but there can be a delay of roughly 90 minutes before the
  results are part of exports.

  Note: BigQuery maps unicode() to STRING for python2. If str is used that is
  mapped to BYTE.

  Args:
    bigquery_project_name: Name of the gcp project.
    bigquery_dataset_table_name: data_set and table name.
    execution_summary: benchmark summary dictionary of results.
  """
  # pylint: disable=C6204
  import google.auth
  from google.cloud import bigquery

  if not bigquery_project_name:
    logging.info('Skipped uploading benchmark result to bigquery because '
                 'bigquery project name is not set.')
    return
  if not bigquery_dataset_table_name:
    logging.info('Skipped uploading benchmark result to bigquery because '
                 'bigquery table name is not set.')
    return

  credentials = google.auth.default()[0]
  dataset_name = bigquery_dataset_table_name.split('.')[0]
  table_name = bigquery_dataset_table_name.split('.')[1]
  client = bigquery.Client(project=bigquery_project_name,
                           credentials=credentials)

  benchmark_summary_input = {}
  for key, value in execution_summary.items():
    if isinstance(value, dict):
      benchmark_summary_input[key] = unicode(json.dumps(value))
    else:
      benchmark_summary_input[key] = unicode(value)
  logging.debug('Bigquery input for benchmark_summary table is %s',
                json.dumps(benchmark_summary_input, indent=2))

  errors = []
  # TODO(tobyboyd): Shim to direct results to new table until all jobs
  # are updated.
  if 'benchmark_results' in dataset_name:
    if dataset_name == 'benchmark_results_dev':
      table_ref = client.dataset('perfzero_dev').table('benchmark_summary')
      table_obj = client.get_table(table_ref)
    elif dataset_name == 'benchmark_results':
      table_ref = client.dataset('perfzero').table('benchmark_summary')
      table_obj = client.get_table(table_ref)
  else:
    table_ref = client.dataset(dataset_name).table(table_name)
    table_obj = client.get_table(table_ref)
  errors.extend(client.insert_rows(table_obj, [benchmark_summary_input]))

  if errors:
    logging.error('Failed to upload benchmark result to bigquery due to errors %s',
                  errors)
  else:
    logging.info('Uploaded benchmark result to the table %s of the bigquery project %s.',
                 bigquery_dataset_table_name, bigquery_project_name)


def build_benchmark_result(raw_benchmark_result, has_exception, trial_id):
  """Converts test_log.proto format to PerfZero format."""
  benchmark_result = {}
  benchmark_result['name'] = raw_benchmark_result['name']
  benchmark_result['wall_time'] = raw_benchmark_result['wall_time']

  succeeded = not has_exception
  extras = []
  for name in raw_benchmark_result.get('extras', {}):
    entry = {}
    entry['name'] = name
    if 'double_value' in raw_benchmark_result['extras'][name]:
      entry['value'] = raw_benchmark_result['extras'][name]['double_value']
    else:
      entry['value'] = raw_benchmark_result['extras'][name]['string_value']
    extras.append(entry)

  metrics = []
  for metric in raw_benchmark_result.get('metrics', []):
    value = metric['value']
    if 'min_value' in metric and metric['min_value'] > value:
      succeeded = False
    if 'max_value' in metric and metric['max_value'] < value:
      succeeded = False
    metrics.append(metric)

  benchmark_result['succeeded'] = succeeded
  benchmark_result['extras'] = extras
  benchmark_result['metrics'] = metrics
  benchmark_result['trial_id'] = trial_id
  return benchmark_result


def build_execution_summary(execution_timestamp, execution_id,
                            ml_framework_build_label, execution_label,
                            platform_name, system_name, output_gcs_url,
                            benchmark_result, env_vars, flags, harness_info,
                            site_package_info, process_info, has_exception,
                            is_tpu_benchmark):
  """Builds summary of the execution."""
  # Avoids module not found during setup phase when tf is not installed yet.
  # pylint: disable=C6204
  import tensorflow as tf

  benchmark_info = {}
  benchmark_info['harness_name'] = 'perfzero'
  benchmark_info['harness_info'] = harness_info
  benchmark_info['has_exception'] = has_exception
  if execution_label:
    benchmark_info['execution_label'] = execution_label
  if output_gcs_url:
    benchmark_info['output_url'] = '{}/{}/'.format(output_gcs_url, execution_id)
  if env_vars:
    benchmark_info['env_vars'] = env_vars
  if flags:
    benchmark_info['flags'] = flags
  benchmark_info['site_package_info'] = site_package_info

  ml_framework_info = {}
  ml_framework_info['name'] = 'tensorflow'
  ml_framework_info['version'] = tf.__version__
  # tf.__git_version__ in Python3 has format b'version_string'
  if tf.__git_version__[0] == 'b':
    ml_framework_info['build_version'] = tf.__git_version__[2:-1]
  else:
    ml_framework_info['build_version'] = tf.__git_version__
  if ml_framework_build_label:
    ml_framework_info['build_label'] = ml_framework_build_label

  system_info = {}
  if platform_name:
    system_info['platform_name'] = platform_name
  if system_name:
    system_info['system_name'] = system_name
  if not is_tpu_benchmark:
    gpu_info = utils.get_gpu_info()
    if gpu_info:
      system_info['accelerator_driver_version'] = gpu_info['gpu_driver_version']
      system_info['accelerator_model'] = gpu_info['gpu_model']
      system_info['accelerator_count'] = gpu_info['gpu_count']
  system_info['cpu_model'] = utils.get_cpu_name()
  system_info['physical_cpu_count'] = psutil.cpu_count(logical=False)
  system_info['logical_cpu_count'] = psutil.cpu_count(logical=True)
  system_info['cpu_socket_count'] = utils.get_cpu_socket_count()
  system_info['hostname'] = socket.gethostname()

  execution_summary = {}
  execution_summary['execution_id'] = execution_id
  execution_summary['execution_timestamp'] = execution_timestamp
  execution_summary['benchmark_result'] = benchmark_result
  execution_summary['benchmark_info'] = benchmark_info
  execution_summary['setup_info'] = {}
  execution_summary['ml_framework_info'] = ml_framework_info
  execution_summary['system_info'] = system_info
  if process_info:
    execution_summary['process_info'] = process_info
  return execution_summary
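A small sketch (not part of the commit, with made-up numbers) of the conversion build_benchmark_result performs: a metric that falls outside its min_value/max_value bounds marks the run as failed.

```python
import perfzero.report_utils as report_utils

raw_benchmark_result = {
    'name': 'resnet50.synthetic',
    'wall_time': 123.4,
    'extras': {'batch_size': {'double_value': 256}},
    'metrics': [{'name': 'exp_per_second', 'value': 900, 'min_value': 1000}],
}
result = report_utils.build_benchmark_result(
    raw_benchmark_result, has_exception=False, trial_id=0)
print(result['succeeded'])  # False: exp_per_second fell below its min_value
print(result['extras'])     # [{'name': 'batch_size', 'value': 256}]
```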
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tensorflow_profiler.py
0 → 100644
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Collect profiler data for Tensorboard with a separate thread."""
from __future__ import print_function

import logging
import os
import sched
import threading
import time
import traceback

import perfzero.utils as utils


def _start_profiler(output_dir):
  """Start profiler.

  Args:
    output_dir: log directory to place the profiler data
  """
  import tensorflow as tf  # pylint: disable=g-import-not-at-top
  profiler_data_dir = os.path.join(output_dir, 'profiler_data')
  utils.make_dir_if_not_exist(profiler_data_dir)
  logging.info('Starting TensorFlow profiler and saving data to dir %s',
               profiler_data_dir)
  try:
    tf.profiler.experimental.start(profiler_data_dir)
    logging.info('Started TensorFlow profiler')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to start due to error:\n%s',
                  traceback.format_exc())


def _stop_profiler():
  """Stop profiler."""
  import tensorflow as tf  # pylint: disable=g-import-not-at-top
  try:
    tf.profiler.experimental.stop()
    logging.info('Stopped TensorFlow profiler.')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to stop due to error:\n%s',
                  traceback.format_exc())


class TensorFlowProfiler(object):
  """Collect profiler data for Tensorboard with a separate thread."""

  def __init__(self, profiler_enabled_time_str, output_dir):
    """Constructor.

    Args:
      profiler_enabled_time_str: the value of the config --profiler_enabled_time
      output_dir: log directory to place the profiler data
    """
    self.profiler_enabled_time_str = profiler_enabled_time_str
    self.output_dir = output_dir
    self.exit_event = threading.Event()
    self.scheduler = sched.scheduler(time.time, self._sleep_until_exit)

  def _sleep_until_exit(self, timeout):
    start_time = time.time()
    cur_time = time.time()
    while cur_time - start_time < timeout and not self.exit_event.is_set():
      time.sleep(min(1, timeout + start_time - cur_time))
      cur_time = time.time()

  def start(self):
    """Schedule start/stop profiler events specified in profiler_enabled_time_str."""
    if not self.profiler_enabled_time_str:
      return

    last_end_time = -1
    for time_str in self.profiler_enabled_time_str.split(','):
      begin_time = int(time_str.split(':')[0].strip())
      end_time_str = time_str.split(':')[1].strip() if ':' in time_str else None
      end_time = int(end_time_str) if end_time_str else 365 * 24 * 60 * 60
      if begin_time <= last_end_time:
        raise ValueError('begin_time {} is no larger than the last '
                         'end_time {}'.format(begin_time, last_end_time))
      if end_time <= begin_time:
        raise ValueError('end_time {} is no larger than begin_time {}'.format(
            end_time, begin_time))
      # 4th positional arg added to support Python2 for the short-term.
      self.scheduler.enter(begin_time, 1, _start_profiler,
                           argument=(self.output_dir,))
      self.scheduler.enter(end_time, 1, _stop_profiler, ())  # pylint: disable=no-value-for-parameter
      last_end_time = end_time

    threading.Thread(target=self.scheduler.run).start()

  def stop(self):
    """Stop scheduler and save profiler data if any event is cancelled."""
    event_canceled = False
    for event in self.scheduler.queue:
      try:
        self.scheduler.cancel(event)
        event_canceled = True
      except ValueError:
        # This is OK because the event may have been just canceled
        pass
    # Signal the scheduler thread to stop sleeping
    self.exit_event.set()

    # Save the profiler data if any event is canceled
    if event_canceled:
      _stop_profiler()
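A sketch (not part of the commit) of how a --profiler_enabled_time value such as '30:90,300:' drives the profiler: collection starts 30 seconds into the run, stops at 90 seconds, starts again at 300 seconds, and the open-ended segment runs until stop() is called. The output directory is illustrative and run_benchmark() is a hypothetical stand-in for the benchmark method execution.

```python
from perfzero.tensorflow_profiler import TensorFlowProfiler

profiler = TensorFlowProfiler('30:90,300:', '/tmp/perfzero_output')
profiler.start()   # schedules tf.profiler start/stop events on a background thread
run_benchmark()    # hypothetical placeholder for the actual benchmark method
profiler.stop()    # cancels pending events and saves any in-flight profile data
```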
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_no_processes.txt
0 → 100644
Tue Jan 9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 50C P0 196W / 300W | 15643MiB / 16276MiB | 97% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:07:00.0 Off | 0 |
| N/A 41C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:0A:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:0B:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 36C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 Tesla P100-SXM2... On | 00000000:86:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 Tesla P100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 38C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 Tesla P100-SXM2... On | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_processes.txt
0 → 100644
Tue Jan 9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 50C P0 196W / 300W | 15643MiB / 16276MiB | 97% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:07:00.0 Off | 0 |
| N/A 41C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:0A:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:0B:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 36C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 Tesla P100-SXM2... On | 00000000:86:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 Tesla P100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 38C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 Tesla P100-SXM2... On | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 44454 C /usr/bin/python 15631MiB |
| 1 44454 C /usr/bin/python 15471MiB |
| 2 44454 C /usr/bin/python 15471MiB |
| 3 44454 C /usr/bin/python 15471MiB |
+-----------------------------------------------------------------------------+
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/nvme_device_log.txt
0 → 100644
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nvme0n8 259:7 0 375G 0 disk
nvme0n6 259:5 0 375G 0 disk
sdb 8:16 0 50G 0 disk
└─sdb1 8:17 0 50G 0 part /tmpfs
nvme0n4 259:3 0 375G 0 disk
nvme0n2 259:1 0 375G 0 disk
nvme0n7 259:6 0 375G 0 disk
nvme0n5 259:4 0 375G 0 disk
sda 8:0 0 100G 0 disk
└─sda1 8:1 0 100G 0 part /
nvme0n3 259:2 0 375G 0 disk
nvme0n1 259:0 0 375G 0 disk
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tpu_runtime_utils.py
0 → 100644
"""Utility to manage the tpu version before starting the benchmark."""
import json

from absl import logging
from six.moves.urllib import request

try:
  from cloud_tpu_client import client  # pylint: disable=g-import-not-at-top
except ImportError:
  print('Falling back to TensorFlow client; we recommend you install the Cloud '
        'TPU client directly with pip install cloud-tpu-client.')
  from tensorflow.python.tpu.client import client  # pylint: disable=g-import-not-at-top


def _as_text(s):
  """Converts a byte/string into string."""
  if isinstance(s, bytes):
    return s.decode('utf-8')
  return s


def _get_content(url):
  """Opens the url and loads the response into json."""
  logging.info('opening url %s', url)
  req = request.Request(url)
  resp = request.urlopen(req)
  resp_text = _as_text(resp.read())
  logging.info('response text = %s', resp_text)
  return json.loads(resp_text)


def _get_version_info(url, version_label):
  """Constructs a version info from the response."""
  json_data = _get_content(url)
  logging.info('json_data = %s', json_data)
  if 'currentVersion' in json_data:
    commit_id = json_data['currentVersion']
  elif 'buildLabel' in json_data:
    commit_id = json_data['buildLabel']
  else:
    commit_id = ''
  info = {
      'url': '',
      'hash': commit_id,
      'branch': version_label,
      'piper_id': json_data.get('piperOriginRevId', '')
  }
  return info


def _configure_tpu_version(tpu_name, version_label, new_version_id):
  """Returns the current tpu version after resetting to an optional version."""
  # The tpu_name is an arbitrary, user-chosen unique string for this tpu.
  logging.info('Trying to connect to tpu %s', tpu_name)
  tpu_client = client.Client(tpu=tpu_name)
  tpu_client.wait_for_healthy()
  if new_version_id:
    logging.info('Trying to reset tpu version to %s', new_version_id)
    tpu_client.configure_tpu_version(version=new_version_id)
    tpu_client.wait_for_healthy()
    logging.info('TPU healthy after version reset.')
  else:
    logging.info('Using the default tpu version id.')
  workers = tpu_client.network_endpoints()
  if workers:
    ip_addr = workers[0]['ipAddress']
    url = 'http://{}:8475/requestversion'.format(ip_addr)
    return _get_version_info(url, version_label)
  else:
    logging.error('No tpu endpoint info')
    return {
        'url': '',
        'hash': '',
        'branch': version_label,
        'piper_id': '',
    }


def configure_tpu(tpu_params):
  return _configure_tpu_version(
      tpu_params.get('name'),
      version_label=tpu_params.get('version'),
      new_version_id=tpu_params.get('version_id'))
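A sketch (not part of the commit) of how the --tpu_parameters JSON feeds configure_tpu. The parameter values are placeholders, and running this requires a real Cloud TPU and credentials.

```python
import json

import perfzero.tpu_runtime_utils as tpu_runtime_utils

tpu_params = json.loads(
    '{"name": "my-tpu-name", "project": "my-gcp-project-id", '
    '"zone": "europe-west4-a", "size": "v3-8", "version": "nightly-2.x"}')
# Waits for the TPU to be healthy and, if "version_id" is set, resets its runtime version.
version_info = tpu_runtime_utils.configure_tpu(tpu_params)
print(version_info['branch'], version_info['hash'])
```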
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils.py
0 → 100644
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero utility methods."""
from __future__ import print_function

import importlib
import logging
import os
import shutil
import subprocess
import sys
import threading
import traceback

import requests
import json
import re


def create_empty_file(parent_directory, file_basename):
  """Creates an empty file with a given basename in a parent directory.

  Creates parent_directory and intermediate directories if it doesn't exist.
  This is mostly used for creating no-op actions in the Dockerfile.

  Args:
    parent_directory: The path to the parent directory.
    file_basename: The basename for the empty file.
  """
  if not os.path.isdir(parent_directory):
    os.makedirs(parent_directory)
  full_file_name = os.path.join(parent_directory, file_basename)
  with open(full_file_name, 'w'):
    print('Creating empty file: {}'.format(full_file_name))


def checkout_git_repos(git_repos, use_cached_site_packages):
  """Clone, update, or sync a repo.

  Args:
    git_repos: array of dict containing attributes of the git repo to checkout.
    use_cached_site_packages: If true, skip git pull if git_repo already exists.

  Returns:
    A dict containing attributes of the git repositories
  """
  site_package_info = {}
  for repo in git_repos:
    logging.info('Checking out repository from %s to %s',
                 repo['url'], repo['local_path'])
    if not os.path.isdir(repo['local_path']):
      run_commands(['git clone {} {}'.format(repo['url'], repo['local_path'])])
    if 'branch' in repo:
      run_commands(['git -C {} checkout {}'.format(repo['local_path'],
                                                   repo['branch'])])
    if not use_cached_site_packages or 'git_hash' in repo:
      run_commands(['git -C {} pull --rebase'.format(repo['local_path'])])
    if 'git_hash' in repo:
      run_commands(['git -C {} reset --hard {}'.format(repo['local_path'],
                                                       repo['git_hash'])])
    logging.info('Checked-out repository from %s to %s',
                 repo['url'], repo['local_path'])
    site_package_info[repo['dir_name']] = get_git_repo_info(repo['local_path'])

  return site_package_info


def get_git_repo_info(local_path):
  """Get information of the git repository specified by the local_path."""
  git_repo_info = {}

  # Get git url
  cmd = 'git -C {} config --get remote.origin.url'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['url'] = lines[0]
  else:
    logging.error('Error getting git url for repository %s due to %s',
                  local_path, result)
    return {}

  # Get git branch
  cmd = 'git -C {} rev-parse --abbrev-ref HEAD'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['branch'] = lines[0]
  else:
    logging.error('Error getting git branch for repository %s due to %s',
                  local_path, result)
    return {}

  # Get git hash
  cmd = 'git -C {} rev-parse HEAD'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['hash'] = lines[0]
  else:
    logging.error('Error getting git hash for repository %s due to %s',
                  local_path, result)
    return {}

  return git_repo_info


def setup_python_path(site_packages_dir, python_path_str):
  if python_path_str:
    python_paths = python_path_str.split(',')
    for python_path in python_paths:
      logging.info('Adding path %s to sys.path', python_path)
      sys.path.append(os.path.join(site_packages_dir, python_path))
  logging.debug('PYTHONPATH: %s', sys.path)


def active_gcloud_service(gcloud_key_file_url, workspace_dir,
                          download_only=False):
  """Download key file and setup gcloud service credential using the key file.

  Args:
    gcloud_key_file_url: gcloud key file url
    workspace_dir: directory that the key file is downloaded to
    download_only: skip setting up the gcloud service credential if this is true
  """
  if not gcloud_key_file_url:
    return

  local_path = os.path.join(workspace_dir,
                            os.path.basename(gcloud_key_file_url))
  if not os.path.exists(local_path):
    download_data([{'url': gcloud_key_file_url, 'local_path': local_path}])

  if not download_only:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = local_path
    run_commands(['gcloud auth activate-service-account --key-file {}'.format(
        local_path)])
    logging.info('Activated gcloud service account credential')


def setup_gsutil_credential():
  run_commands(['gcloud config set pass_credentials_to_gsutil true'])


def download_data(download_infos):
  """Download data from url to local_path for each (url, local_path) pair in the download_infos.

  Each url should start with gs://, http://, https:// or file://.
  Downloaded files whose names end with .gz will be decompressed in their
  current directory.

  Args:
    download_infos: array of dict which specifies the url and local_path for
      data download
  """
  for info in download_infos:
    if os.path.exists(info['local_path']):
      continue
    original_base_name = os.path.basename(info['url'])
    expected_base_name = os.path.basename(info['local_path'])
    local_path_parent = os.path.dirname(info['local_path'])
    logging.info('Downloading data from %s to %s',
                 info['url'], info['local_path'])
    make_dir_if_not_exist(local_path_parent)

    # Download data to the local path
    if info['url'].startswith('http://') or info['url'].startswith('https://'):
      request = requests.get(info['url'], allow_redirects=True)
      f = open(info['local_path'], 'wb')
      f.write(request.content)
      f.close()
    elif info['url'].startswith('gs://'):
      cmd = ['gsutil', '-m', 'cp', '-r', '-n', info['url'], local_path_parent]
      run_commands([cmd], shell=False)
    elif info['url'].startswith('file://'):
      cmd = ['cp', info['url'][7:], local_path_parent]
      run_commands([cmd], shell=False)
    else:
      raise ValueError('Url {} with prefix {} is not supported.'.format(
          info['url'], info['url'].split(':')[0]))

    # Move data to the expected local path
    if original_base_name != expected_base_name:
      run_commands(['mv {} {}'.format(
          os.path.join(local_path_parent, original_base_name),
          os.path.join(local_path_parent, expected_base_name))])
    logging.info('Downloaded data from %s to %s',
                 info['url'], info['local_path'])

    # Decompress file if file name ends with .gz unless caller sets 'decompress'
    # to False in info.
    if info['url'].endswith('.gz') and info.get('decompress', True):
      run_commands(['tar xvf {} -C {}'.format(info['local_path'],
                                              local_path_parent)])
      logging.info('Decompressed file %s', info['local_path'])


def parse_data_downloads_str(root_data_dir, data_downloads_str):
  """Parse a comma separated string into array of dicts.

  Each dict specifies the url and local_path for a download.

  Args:
    root_data_dir: the directory which should contain all the dataset files
    data_downloads_str: a comma separated string specified by the
      flag --data_downloads

  Returns:
    An array of dict which specifies the url and local_path for data download
  """
  download_infos = []
  if not data_downloads_str:
    return download_infos

  for entry in data_downloads_str.split(','):
    info = {}
    if ';' in entry:
      info['url'] = entry.split(';')[0]
      info['local_path'] = os.path.join(root_data_dir, entry.split(';')[1])
    else:
      info['url'] = entry
      info['local_path'] = os.path.join(root_data_dir, os.path.basename(entry))

    # Canonicalize url to remove trailing '/' and '*'
    if info['url'].endswith('*'):
      info['url'] = info['url'][:-1]
    if info['url'].endswith('/'):
      info['url'] = info['url'][:-1]
    download_infos.append(info)

  return download_infos


def maybe_upload_to_gcs(local_dir, output_gcs_url):
  if not output_gcs_url:
    return
  run_commands(['gsutil -m cp -r {} {}'.format(local_dir, output_gcs_url)])
  logging.info('Uploaded data from local directory %s to gcs %s',
               local_dir, output_gcs_url)


def make_dir_if_not_exist(local_path):
  if not os.path.exists(local_path):
    os.makedirs(local_path)
    logging.info('Created directory %s', local_path)


def run_command(cmd, shell=True):
  """Runs a command and returns its exit code and standard output.

  Args:
    cmd: Command to execute
    shell: True to use shell, false otherwise.

  Returns:
    Tuple of the command return value and the standard out as a string.
  """
  logging.debug('Executing command: %s', cmd)
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                       shell=shell)
  exit_code = None
  line = ''
  stdout = ''
  while exit_code is None or line:
    exit_code = p.poll()
    line = p.stdout.readline().decode('utf-8')
    stdout += line
    logging.debug(line)

  return exit_code, stdout


def run_commands(cmds, shell=True):
  """Runs a list of commands and throws an error if any fail."""
  for cmd in cmds:
    exit_code, stdout = run_command(cmd, shell=shell)
    if exit_code:
      raise Exception('"{}" failed with code:{} and stdout:\n{}'.format(
          cmd, exit_code, stdout))


def get_cpu_name():
  cmd = "cat /proc/cpuinfo | grep 'model name' | sort --unique"
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    model_name_parts = lines[0].split(':')
    return model_name_parts[1].strip()
  else:
    logging.error('Error getting cpuinfo model name: %s', result)
    return ''


def get_cpu_socket_count():
  cmd = 'grep -i "physical id" /proc/cpuinfo | sort -u | wc -l'
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    return int(lines[0])
  else:
    logging.error('Error getting cpuinfo socket count: %s', result)
    return -1


def _get_amd_gpu_info():
  """Returns gpu information using rocm-smi.

  Note: Assumes if the system has multiple GPUs, that they are all the same.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `rocm-smi` is not found or fails.
  """
  cmd = 'rocm-smi --json --showproductname --showdriverversion'
  exit_code, result = run_command(cmd)
  if exit_code != 0:
    logging.error('rocm-smi did not return as expected: %s', result)
    return None

  def get_gpu_driver_version(rocm_smi_output):
    return rocm_smi_output['system']['Driver version']

  def get_gpu_model(rocm_smi_output):
    gpu_model = ""
    for key, value in rocm_smi_output.items():
      if re.match("card[0-9]+", key):
        gpu_model = value['Card SKU']
        break
    return gpu_model

  def get_gpu_count(rocm_smi_output):
    gpu_count = 0
    for key, value in rocm_smi_output.items():
      if re.match("card[0-9]+", key):
        gpu_count += 1
    return gpu_count

  rocm_smi_output = json.loads(result)
  gpu_info = {}
  gpu_info['gpu_driver_version'] = get_gpu_driver_version(rocm_smi_output)
  gpu_info['gpu_model'] = get_gpu_model(rocm_smi_output)
  gpu_info['gpu_count'] = get_gpu_count(rocm_smi_output)
  return gpu_info


def _get_nvidia_gpu_info():
  """Returns gpu information using nvidia-smi.

  Note: Assumes if the system has multiple GPUs that they are all the same with
  one exception. If the first result is a Quadro, the heuristic assumes
  this may be a workstation and takes the second entry.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `nvidia-smi` is not found or fails.
  """
  cmd = 'nvidia-smi --query-gpu=driver_version,gpu_name --format=csv'
  exit_code, result = run_command(cmd)
  if exit_code != 0:
    logging.error('nvidia-smi did not return as expected: %s', result)
    return None

  lines = result.splitlines()
  gpu_info_line = lines[1]
  if 'Quadro' in gpu_info_line and len(lines) >= 3:
    gpu_info_line = lines[2]

  gpu_info = {}
  gpu_info['gpu_driver_version'] = gpu_info_line.split(',')[0].strip()
  gpu_info['gpu_model'] = gpu_info_line.split(',')[1].strip()
  gpu_info['gpu_count'] = len(lines) - 1
  return gpu_info


def get_gpu_info():
  """Returns gpu information using either nvidia-smi or rocm-smi.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    the smi tool is not found or fails.
  """
  return _get_amd_gpu_info() if shutil.which("rocm-smi") \
      else _get_nvidia_gpu_info()


def _install_tpu_tool():
  """Installs the ctpu tool for managing cloud TPUs.

  Follows the instructions here:
  https://github.com/tensorflow/tpu/tree/master/tools/ctpu
  """
  if not os.path.exists('ctpu'):
    logging.info('Installing TPU tool')
    commands = [
        'wget https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu',
        'chmod a+x ctpu',
    ]
    run_commands(commands)


def setup_tpu(parameters):
  """Sets up a TPU with a given set of parameters.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during setup.
  """
  try:
    _install_tpu_tool()
    args = [
        '--name={}'.format(parameters.get('name')),
        '--project={}'.format(parameters.get('project')),
        '--zone={}'.format(parameters.get('zone')),
        '--tpu-size={}'.format(parameters.get('size')),
        '--tf-version={}'.format(parameters.get('version')),
        '--tpu-only',
        '-noconf',
    ]
    command = './ctpu up {}'.format(' '.join(args))
    logging.info('Setting up TPU: %s', command)
    exit_code, output = run_command(command)
    if exit_code != 0:
      logging.error('Error in setup with output: %s', output)
    return exit_code != 0
  except Exception:
    logging.error('Unable to setup TPU')
    run_command('rm -f ctpu')
    sys.exit(1)


def cleanup_tpu(parameters):
  """Cleans up an existing TPU.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during cleanup.
  """
  _install_tpu_tool()
  args = [
      '--name={}'.format(parameters.get('name')),
      '--project={}'.format(parameters.get('project')),
      '--zone={}'.format(parameters.get('zone')),
      '--tpu-only',
      '-noconf',
  ]
  command = './ctpu delete {}'.format(' '.join(args))
  logging.info('Cleaning up TPU: %s', command)
  exit_code, output = run_command(command)
  if exit_code != 0:
    logging.error('Error in cleanup with output: %s', output)
  return exit_code != 0


def read_benchmark_result(benchmark_result_file_path):
  """Read benchmark result from the protobuf file."""
  from google.protobuf import json_format  # pylint: disable=g-import-not-at-top
  from tensorflow.core.util import test_log_pb2  # pylint: disable=g-import-not-at-top

  if not os.path.isfile(benchmark_result_file_path):
    logging.error('Failed to read benchmark result because '
                  'file %s does not exist', benchmark_result_file_path)
    return {}

  with open(benchmark_result_file_path, 'rb') as f:
    benchmark_entries = test_log_pb2.BenchmarkEntries()
    benchmark_entries.ParseFromString(f.read())
    return json_format.MessageToDict(
        benchmark_entries,
        preserving_proto_field_name=True,
        including_default_value_fields=True)['entry'][0]


def print_thread_stacktrace():
  print('Here is the stacktrace for all threads:')
  thread_names = {t.ident: t.name for t in threading.enumerate()}
  for thread_id, frame in sys._current_frames().items():  # pylint: disable=protected-access
    print('Thread {}'.format(thread_names.get(thread_id, thread_id)))
    traceback.print_stack(frame)


def instantiate_benchmark_class(benchmark_class, output_dir, root_data_dir,
                                tpu, constructor_args,
                                benchmark_class_type=None):
  """Return initialized benchmark class."""
  module_import_path, class_name = benchmark_class.rsplit('.', 1)
  module = importlib.import_module(module_import_path)
  class_ = getattr(module, class_name)

  if benchmark_class_type == 'tf_benchmark':
    # For benchmarks inheriting from tf.test.Benchmark, instantiate them directly.
    instance = class_(**constructor_args)
  else:
    # Default instantiation for perfzero_benchmark classes.
    instance = class_(
        output_dir=output_dir,
        root_data_dir=root_data_dir,
        tpu=tpu,
        **constructor_args)

  return instance


def copy_and_rename_dirs(dir_spec_string, dst_base_dir):
  """Copies list of <dir-path>:new_name specs into a new dest dir.

  If a path /path1/path2/dir:new_dir is given, it copies /path1/path2/dir to
  dst_base_dir/new_dir.

  Args:
    dir_spec_string: Comma separated list of /path1/path2:new_name specs.
    dst_base_dir: The base dir to contain the copies.
  """
  if not dir_spec_string:
    return

  dir_specs = dir_spec_string.split(',')
  for src_dir_with_name in dir_specs:
    src_dir, final_basename = src_dir_with_name.split(':')
    dst_dir = os.path.join(dst_base_dir, final_basename)

    if os.path.isdir(dst_dir):
      logging.info('[DELETE] pre-existing %s', dst_dir)
      shutil.rmtree(dst_dir)

    logging.info('[COPY] %s -> %s', src_dir, dst_dir)
    shutil.copytree(src_dir, dst_dir)
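A sketch (not part of the commit) of how the harness instantiates a benchmark class by its dotted import path via instantiate_benchmark_class; the class path, method name, and directories below are hypothetical placeholders.

```python
import perfzero.utils as utils

benchmark = utils.instantiate_benchmark_class(
    benchmark_class='official.benchmark.resnet_benchmark.Resnet50Benchmark',  # hypothetical path
    output_dir='/tmp/perfzero_output',
    root_data_dir='/data',
    tpu=None,
    constructor_args={})
method = getattr(benchmark, 'benchmark_synthetic_data')  # hypothetical method name
method()
```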
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils_test.py
0 → 100644
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests utils.py."""
import os
import unittest

from mock import call
from mock import MagicMock
from mock import patch

import perfzero.utils as utils
import tensorflow as tf  # pylint: disable=g-bad-import-order


class TestUtils(unittest.TestCase, tf.test.Benchmark):

  def test_protobuf_read(self):
    output_dir = '/tmp/'
    os.environ['TEST_REPORT_FILE_PREFIX'] = output_dir
    benchmark_result_file_path = os.path.join(output_dir,
                                              'TestUtils.testReportBenchmark')
    if os.path.exists(benchmark_result_file_path):
      os.remove(benchmark_result_file_path)

    self.report_benchmark(
        iters=2000,
        wall_time=1000,
        name='testReportBenchmark',
        metrics=[{'name': 'metric_name_1', 'value': 0, 'min_value': 1},
                 {'name': 'metric_name_2', 'value': 90, 'min_value': 0,
                  'max_value': 95}])

    actual_result = utils.read_benchmark_result(benchmark_result_file_path)
    os.remove(benchmark_result_file_path)

    expected_result = {
        'name': 'TestUtils.testReportBenchmark',
        # google.protobuf.json_format.MessageToDict() will convert
        # int64 field to string.
        'iters': '2000',
        'wall_time': 1000,
        'cpu_time': 0,
        'throughput': 0,
        'extras': {},
        'metrics': [
            {'name': 'metric_name_1', 'value': 0, 'min_value': 1},
            {'name': 'metric_name_2', 'value': 90, 'min_value': 0,
             'max_value': 95}
        ]
    }
    self.assertDictEqual(expected_result, actual_result)

  @patch('perfzero.utils.get_git_repo_info')
  @patch('perfzero.utils.run_commands')
  def test_checkout_git_repos(self, run_commands_mock, get_git_repo_info_mock):
    git_repo_1 = {}
    git_repo_1['url'] = 'url_1'
    git_repo_1['local_path'] = 'local_path_1'
    git_repo_1['dir_name'] = 'dir_name_1'
    git_repo_1['branch'] = 'branch_1'
    git_repo_1['git_hash'] = 'git_hash_1'

    git_repo_2 = {}
    git_repo_2['url'] = 'url_2'
    git_repo_2['local_path'] = 'local_path_2'
    git_repo_2['dir_name'] = 'dir_name_2'
    git_repo_2['branch'] = 'branch_2'

    git_repo_info_1 = {'url': 'url_1'}
    git_repo_info_2 = {'url': 'url_2'}
    get_git_repo_info_mock.side_effect = \
        lambda local_path: git_repo_info_1 if local_path == 'local_path_1' else git_repo_info_2  # pylint: disable=line-too-long

    site_package_info = utils.checkout_git_repos([git_repo_1, git_repo_2],
                                                 False)

    self.assertEqual(2, len(site_package_info))
    self.assertEqual(git_repo_info_1, site_package_info['dir_name_1'])
    self.assertEqual(git_repo_info_2, site_package_info['dir_name_2'])
    run_commands_mock.assert_has_calls(
        any_order=False,
        calls=[
            call(['git clone url_1 local_path_1']),
            call(['git -C local_path_1 checkout branch_1']),
            call(['git -C local_path_1 pull --rebase']),
            call(['git -C local_path_1 reset --hard git_hash_1']),
            call(['git clone url_2 local_path_2']),
            call(['git -C local_path_2 checkout branch_2'])
        ])

  @patch('perfzero.utils.run_command')
  def test_get_git_repo_info(self, run_command_mock):
    run_command_mock.side_effect = [
        [0, 'git_url'],
        [0, 'branch_name'],
        [0, 'git_hash']
    ]

    git_repo_info = utils.get_git_repo_info('local_path_1')

    self.assertEqual(
        {'url': 'git_url', 'branch': 'branch_name', 'hash': 'git_hash'},
        git_repo_info)
    run_command_mock.assert_has_calls(
        any_order=False,
        calls=[
            call('git -C local_path_1 config --get remote.origin.url'),
            call('git -C local_path_1 rev-parse --abbrev-ref HEAD'),
            call('git -C local_path_1 rev-parse HEAD')
        ])

  @patch('builtins.open')
  @patch('perfzero.utils.make_dir_if_not_exist')
  @patch('requests.get')
  @patch('perfzero.utils.run_commands')
  def test_download_data(self, run_commands_mock, requests_get_mock,
                         make_dir_mock, open_mock):  # pylint: disable=unused-argument
    get_mock = MagicMock()
    get_mock.content = 'content'
    requests_get_mock.return_value = get_mock
    download_info_1 = {
        'url': 'gs://remote_path_1/name_1',
        'local_path': 'local_path_1/modified_name_1'
    }
    download_info_2 = {
        'url': 'http://remote_path_2/name_2',
        'local_path': 'local_path_2/modified_name_2'
    }

    utils.download_data([download_info_1, download_info_2])

    make_dir_mock.assert_has_calls(
        any_order=False,
        calls=[call('local_path_1'), call('local_path_2')])
    requests_get_mock.assert_called_once_with(
        'http://remote_path_2/name_2', allow_redirects=True)
    run_commands_mock.assert_has_calls(
        any_order=False,
        calls=[
            call([['gsutil', '-m', 'cp', '-r', '-n',
                   'gs://remote_path_1/name_1', 'local_path_1']], shell=False),
            call(['mv local_path_1/name_1 local_path_1/modified_name_1']),
            call(['mv local_path_2/name_2 local_path_2/modified_name_2'])
        ])

  def test_parse_data_downloads_str(self):
    data_downloads_str = 'url_1;relative_path_1,url_2;relative_path_2'
    download_infos = utils.parse_data_downloads_str('/root_data_dir',
                                                    data_downloads_str)

    self.assertEqual(2, len(download_infos))
    self.assertEqual(download_infos[0],
                     {'url': 'url_1',
                      'local_path': '/root_data_dir/relative_path_1'})
    self.assertEqual(download_infos[1],
                     {'url': 'url_2',
                      'local_path': '/root_data_dir/relative_path_2'})

  @patch('perfzero.utils.run_command')
  def test_get_cpu_name(self, run_command_mock):
    """Tests extracting the cpu model name."""
    run_command_mock.return_value = [
        0, 'model name : Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz\n']
    cpu_name = utils.get_cpu_name()
    self.assertEqual('Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz', cpu_name)

  @patch('perfzero.utils.run_command')
  def test_get_cpu_socket_count(self, run_command_mock):
    """Tests get socket count."""
    run_command_mock.return_value = [0, '2\n']
    cpu_socket_count = utils.get_cpu_socket_count()
    self.assertEqual(2, cpu_socket_count)

  @patch('perfzero.utils.run_command')
  def test_get_gpu_model(self, run_command_mock):
    # Tests get gpu info parses expected value into expected components.
    run_command_mock.return_value = [
        0, 'driver_version, name\n381.99, GTX 1080\n']
    gpu_model = utils.get_gpu_info()['gpu_model']
    self.assertEqual('GTX 1080', gpu_model)

    # Tests gpu info returns second entry if first entry is a Quadro.
    run_command_mock.return_value = [
        0, 'blah\n200.99, Quadro K900\n381.99, GTX 1080\n']
    gpu_model = utils.get_gpu_info()['gpu_model']
    self.assertEqual('GTX 1080', gpu_model)

  @patch('perfzero.utils.run_command')
  def test_get_gpu_count(self, run_command_mock):
    """Tests gpu info returns second entry if first entry is a Quadro."""
    run_command_mock.return_value = [
        0, 'blah\n200.99, Quadro K900\n381.99, GTX 1080\n']
    gpu_count = utils.get_gpu_info()['gpu_count']
    self.assertEqual(2, gpu_count)