"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "776bb2892c169d6c96d19750f0f61b7f77f3f170"
Commit 9dafea91 authored by sunxx1

Merge branch 'qianyj_tf' into 'main'

update tf code

See merge request dcutoolkit/deeplearing/dlexamples_new!35
Tue Jan  9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81                 Driver Version: 384.81                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla P100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   50C    P0   196W / 300W |  15643MiB / 16276MiB |     97%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   41C    P0    50W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   33C    P0    48W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla P100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |
| N/A   34C    P0    49W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   4  Tesla P100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |
| N/A   36C    P0    50W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   5  Tesla P100-SXM2...  On   | 00000000:86:00.0 Off |                    0 |
| N/A   33C    P0    48W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   6  Tesla P100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |
| N/A   38C    P0    48W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   7  Tesla P100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |
| N/A   34C    P0    49W / 300W |  15483MiB / 16276MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0     44454      C   /usr/bin/python                            15631MiB |
|    1     44454      C   /usr/bin/python                            15471MiB |
|    2     44454      C   /usr/bin/python                            15471MiB |
|    3     44454      C   /usr/bin/python                            15471MiB |
+-----------------------------------------------------------------------------+
NAME    MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nvme0n8 259:7    0 375G  0 disk
nvme0n6 259:5    0 375G  0 disk
sdb       8:16   0  50G  0 disk
└─sdb1    8:17   0  50G  0 part /tmpfs
nvme0n4 259:3    0 375G  0 disk
nvme0n2 259:1    0 375G  0 disk
nvme0n7 259:6    0 375G  0 disk
nvme0n5 259:4    0 375G  0 disk
sda       8:0    0 100G  0 disk
└─sda1    8:1    0 100G  0 part /
nvme0n3 259:2    0 375G  0 disk
nvme0n1 259:0    0 375G  0 disk
"""Utility to manage the tpu version before starting the benchmark."""
import json
from absl import logging
from six.moves.urllib import request
try:
from cloud_tpu_client import client # pylint: disable=g-import-not-at-top
except ImportError:
print(
'Falling back to TensorFlow client; we recommended you install the Cloud '
'TPU client directly with pip install cloud-tpu-client.')
from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top


def _as_text(s):
  """Converts a byte/string into string."""
  if isinstance(s, bytes):
    return s.decode('utf-8')
  return s


def _get_content(url):
  """Opens the url and loads the response into json."""
  logging.info('opening url %s', url)
  req = request.Request(url)
  resp = request.urlopen(req)
  resp_text = _as_text(resp.read())
  logging.info('response text = %s', resp_text)
  return json.loads(resp_text)


def _get_version_info(url, version_label):
  """Constructs a version info from the response."""
  json_data = _get_content(url)
  logging.info('json_data = %s', json_data)
  if 'currentVersion' in json_data:
    commit_id = json_data['currentVersion']
  elif 'buildLabel' in json_data:
    commit_id = json_data['buildLabel']
  else:
    commit_id = ''
  info = {
      'url': '',
      'hash': commit_id,
      'branch': version_label,
      'piper_id': json_data.get('piperOriginRevId', '')
  }
  return info
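
# Illustrative note (not in the original file): the fields read above imply the
# worker's version endpoint returns JSON shaped roughly like the following; the
# exact payload is an assumption inferred from this parsing code.
#   {"currentVersion": "<runtime version or commit hash>",
#    "piperOriginRevId": "<internal revision id>"}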


def _configure_tpu_version(tpu_name, version_label, new_version_id):
  """Returns the current tpu version after resetting to an optional version."""
  # The tpu_name is arbitrary / user chosen unique string for this tpu.
  logging.info('Trying to connect to tpu %s', tpu_name)
  tpu_client = client.Client(tpu=tpu_name)
  tpu_client.wait_for_healthy()

  if new_version_id:
    logging.info('Trying to reset tpu version to %s', new_version_id)
    tpu_client.configure_tpu_version(version=new_version_id)
    tpu_client.wait_for_healthy()
    logging.info('TPU healthy after version reset.')
  else:
    logging.info('Using the default tpu version id.')

  workers = tpu_client.network_endpoints()
  if workers:
    ip_addr = workers[0]['ipAddress']
    url = 'http://{}:8475/requestversion'.format(ip_addr)
    return _get_version_info(url, version_label)
  else:
    logging.error('No tpu endpoint info')
    return {
        'url': '',
        'hash': '',
        'branch': version_label,
        'piper_id': '',
    }


def configure_tpu(tpu_params):
  """Resets the TPU per tpu_params and returns its version info."""
  return _configure_tpu_version(
      tpu_params.get('name'),
      version_label=tpu_params.get('version'),
      new_version_id=tpu_params.get('version_id'))
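
For reference, a minimal usage sketch (not part of the committed file): configure_tpu expects a dict with the 'name', 'version', and 'version_id' keys read above and returns the version-info dict built by _get_version_info. The values below are placeholders, not taken from this commit.

params = {
    'name': 'my-benchmark-tpu',   # user-chosen TPU name (placeholder)
    'version': 'nightly',         # label recorded as 'branch' in the result (placeholder)
    'version_id': '',             # falsy value keeps the current TPU version
}
version_info = configure_tpu(params)
print(version_info)  # e.g. {'url': '', 'hash': '...', 'branch': 'nightly', 'piper_id': '...'}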