_performance.py
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Register flags for optimizing performance."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import multiprocessing

from absl import flags    # pylint: disable=g-bad-import-order
import tensorflow as tf   # pylint: disable=g-bad-import-order

from official.utils.flags._conventions import help_wrap


# Map string to (TensorFlow dtype, default loss scale)
DTYPE_MAP = {
    "fp16": (tf.float16, 128),
    "fp32": (tf.float32, 1),
}


def get_tf_dtype(flags_obj):
  return DTYPE_MAP[flags_obj.dtype][0]


def get_loss_scale(flags_obj):
  if flags_obj.loss_scale == "dynamic":
    return flags_obj.loss_scale
  elif flags_obj.loss_scale is not None:
    return float(flags_obj.loss_scale)
  return DTYPE_MAP[flags_obj.dtype][1]
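
# For illustration: with the flags registered by define_performance() below,
# --dtype=fp16 makes get_tf_dtype() return tf.float16, and get_loss_scale()
# falls back to the DTYPE_MAP default of 128 unless --loss_scale is passed
# explicitly (or set to "dynamic" when dynamic loss scaling is enabled).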


def define_performance(num_parallel_calls=True, inter_op=True, intra_op=True,
                       synthetic_data=True, max_train_steps=True, dtype=True,
                       all_reduce_alg=True, num_packs=True,
                       tf_gpu_thread_mode=False,
                       datasets_num_private_threads=False,
                       datasets_num_parallel_batches=False,
                       dynamic_loss_scale=False):
  """Register flags for specifying performance tuning arguments.

  Args:
    num_parallel_calls: Create a flag to specify parallelism of data loading.
    inter_op: Create a flag to allow specification of inter op threads.
    intra_op: Create a flag to allow specification of intra op threads.
    synthetic_data: Create a flag to allow the use of synthetic data.
    max_train_steps: Create a flag to allow specification of the maximum
      number of training steps.
    dtype: Create flags for specifying dtype.
    all_reduce_alg: If set, forces a specific algorithm for multi-gpu.
    num_packs: If set, provides the number of packs for MirroredStrategy's
      cross device ops.
    tf_gpu_thread_mode: gpu_private triggers use of a private thread pool.
    datasets_num_private_threads: Number of private threads for datasets.
    datasets_num_parallel_batches: Determines how many batches to process in
      parallel when using map and batch from tf.data.
    dynamic_loss_scale: Allow the "loss_scale" flag to take on the value
      "dynamic". Only valid if `dtype` is True.

  Returns:
    A list of flags for core.py to mark as key flags.
  """

  key_flags = []
  if num_parallel_calls:
    flags.DEFINE_integer(
        name="num_parallel_calls", short_name="npc",
        default=multiprocessing.cpu_count(),
        help=help_wrap("The number of records that are processed in parallel "
                       "during input processing. This can be optimized per "
                       "data set but for generally homogeneous data sets, "
                       "should be approximately the number of available CPU "
                       "cores. (default behavior)"))

  if inter_op:
    flags.DEFINE_integer(
        name="inter_op_parallelism_threads", short_name="inter", default=0,
        help=help_wrap("Number of inter_op_parallelism_threads to use for CPU. "
                       "See TensorFlow config.proto for details.")
    )

  if intra_op:
    flags.DEFINE_integer(
        name="intra_op_parallelism_threads", short_name="intra", default=0,
        help=help_wrap("Number of intra_op_parallelism_threads to use for CPU. "
                       "See TensorFlow config.proto for details."))

  if synthetic_data:
    flags.DEFINE_bool(
        name="use_synthetic_data", short_name="synth", default=False,
        help=help_wrap(
            "If set, use fake data (zeroes) instead of a real dataset. "
            "This mode is useful for performance debugging, as it removes "
            "input processing steps, but will not learn anything."))

  if max_train_steps:
    flags.DEFINE_integer(
        name="max_train_steps", short_name="mts", default=None, help=help_wrap(
            "The model will stop training if the global_step reaches this "
            "value. If not set, training will run until the specified number "
            "of epochs have run as usual. It is generally recommended to set "
            "--train_epochs=1 when using this flag."
        ))

  if dtype:
    flags.DEFINE_enum(
        name="dtype", short_name="dt", default="fp32",
        enum_values=DTYPE_MAP.keys(),
        help=help_wrap("The TensorFlow datatype used for calculations. "
                       "Variables may be cast to a higher precision on a "
                       "case-by-case basis for numerical stability."))

    loss_scale_help_text = (
        "The amount to scale the loss by when the model is run. {}. Before "
        "gradients are computed, the loss is multiplied by the loss scale, "
        "making all gradients loss_scale times larger. To adjust for this, "
        "gradients are divided by the loss scale before being applied to "
        "variables. This is mathematically equivalent to training without "
        "a loss scale, but the loss scale helps avoid some intermediate "
        "gradients from underflowing to zero. If not provided the default "
        "for fp16 is 128 and 1 for all other dtypes.{}"
    )
    if dynamic_loss_scale:
      loss_scale_help_text = loss_scale_help_text.format(
          "This can be an int/float or the string 'dynamic'",
          " The string 'dynamic' can be used to dynamically determine the "
          "optimal loss scale during training, but currently this "
          "significantly slows down performance")
      loss_scale_validation_msg = ("loss_scale should be a positive int/float "
                                   "or the string 'dynamic'.")
    else:
      loss_scale_help_text = loss_scale_help_text.format(
          "This must be an int/float", "")
      loss_scale_validation_msg = "loss_scale should be a positive int/float."
    flags.DEFINE_string(
        name="loss_scale", short_name="ls", default=None,
        help=help_wrap(loss_scale_help_text))

    @flags.validator(flag_name="loss_scale", message=loss_scale_validation_msg)
    def _check_loss_scale(loss_scale):  # pylint: disable=unused-variable
      """Validator to check the loss scale flag is valid."""
      if loss_scale is None:
        return True  # null case is handled in get_loss_scale()

      if loss_scale == "dynamic" and dynamic_loss_scale:
        return True

      try:
        loss_scale = float(loss_scale)
      except ValueError:
        return False

      return loss_scale > 0
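    # For illustration: "--loss_scale=128" and "--loss_scale=0.5" pass this
    # validator, "--loss_scale=0" and "--loss_scale=-2" do not, and
    # "--loss_scale=dynamic" is accepted only when define_performance() was
    # called with dynamic_loss_scale=True.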

  if all_reduce_alg:
    flags.DEFINE_string(
        name="all_reduce_alg", short_name="ara", default=None,
        help=help_wrap("Defines the algorithm to use for performing all-reduce."
                       " When specified with MirroredStrategy for single "
                       "worker, this controls "
                       "tf.contrib.distribute.AllReduceCrossTowerOps.  When "
                       "specified with MultiWorkerMirroredStrategy, this "
                       "controls "
                       "tf.distribute.experimental.CollectiveCommunication; "
                       "valid options are `ring` and `nccl`."))

  if num_packs:
    flags.DEFINE_integer(
        name="num_packs", default=1,
        help=help_wrap("Sets `num_packs` in the cross device ops used in "
                       "MirroredStrategy.  For details, see "
                       "tf.distribute.NcclAllReduce."))

  if tf_gpu_thread_mode:
    flags.DEFINE_string(
        name="tf_gpu_thread_mode", short_name="gt_mode", default=None,
        help=help_wrap(
            "Whether and how the GPU device uses its own threadpool.")
    )

    flags.DEFINE_integer(
        name="per_gpu_thread_count", short_name="pgtc", default=0,
        help=help_wrap(
            "The number of threads to use for GPU. Only valid when "
            "tf_gpu_thread_mode is not global.")
    )

  if datasets_num_private_threads:
    flags.DEFINE_integer(
        name="datasets_num_private_threads",
        default=None,
        help=help_wrap(
            "Number of threads for a private threadpool created for all "
            "datasets computation.")
    )

  if datasets_num_parallel_batches:
    flags.DEFINE_integer(
        name="datasets_num_parallel_batches",
        default=None,
        help=help_wrap(
            "Determines how many batches to process in parallel when using "
            "map and batch from tf.data.")
    )

  return key_flags
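

# A minimal usage sketch: how a model script might register these flags and
# read them back once absl has parsed argv. The _demo entry point is purely
# illustrative and not part of this module's public interface.
if __name__ == "__main__":
  from absl import app  # local import keeps the sketch self-contained

  def _demo(_):
    # flags.FLAGS is populated by app.run() before _demo is invoked.
    flags_obj = flags.FLAGS
    # e.g. tf.float16 for --dtype=fp16; the loss scale defaults to 128 for
    # fp16 and 1 for fp32 unless --loss_scale is given.
    print("dtype:", get_tf_dtype(flags_obj))
    print("loss scale:", get_loss_scale(flags_obj))

  define_performance(dynamic_loss_scale=True)
  app.run(_demo)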