# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Register flags for optimizing performance."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import multiprocessing

from absl import flags  # pylint: disable=g-bad-import-order
import tensorflow as tf  # pylint: disable=g-bad-import-order

from official.utils.flags._conventions import help_wrap

# Map string to TensorFlow dtype
DTYPE_MAP = {
    "fp16": tf.float16,
    "bf16": tf.bfloat16,
    "fp32": tf.float32,
}


def get_tf_dtype(flags_obj):
  """Returns the TensorFlow dtype selected by the --dtype (and related) flags."""
  if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite":
    # If the graph_rewrite is used, we build the graph with fp32, and let the
    # graph rewrite change ops to fp16.
    return tf.float32
  return DTYPE_MAP[flags_obj.dtype]
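

# For reference, a commented sketch (not executed) of the two fp16 paths that
# get_tf_dtype() distinguishes; the `opt` variable below is an assumption for
# illustration only.
#
# With --fp16_implementation=keras, the Keras mixed precision API handles the
# fp16 casts, e.g.:
#
#   tf.keras.mixed_precision.set_global_policy("mixed_float16")
#
# With --fp16_implementation=graph_rewrite, the graph is built in fp32 and the
# rewrite converts ops to fp16 when the optimizer is wrapped, e.g.:
#
#   opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)
#
# This is why get_tf_dtype() returns tf.float32 in the graph_rewrite case.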


def get_loss_scale(flags_obj, default_for_fp16):
  """Returns the loss scale selected by the --loss_scale flag."""
  dtype = get_tf_dtype(flags_obj)
  if flags_obj.loss_scale == "dynamic":
    return flags_obj.loss_scale
  elif flags_obj.loss_scale is not None:
    return float(flags_obj.loss_scale)
  elif dtype == tf.float32 or dtype == tf.bfloat16:
    return 1  # No loss scaling is needed for fp32 or bf16.
  else:
    assert dtype == tf.float16
    return default_for_fp16
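

# A commented illustration (not executed) of how the value returned by
# get_loss_scale() is typically used in an fp16 custom training step; `tape`,
# `model`, and `optimizer` are assumed names, not part of this module:
#
#   loss_scale = get_loss_scale(flags.FLAGS, default_for_fp16=128)
#   scaled_loss = loss * loss_scale               # e.g. 1e-5 * 128 = 1.28e-3
#   scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
#   grads = [g / loss_scale for g in scaled_grads]  # undo the scaling
#   optimizer.apply_gradients(zip(grads, model.trainable_variables))
#
# Scaling the loss keeps small fp16 gradients from underflowing to zero, and
# dividing by the same factor before applying them leaves the update
# mathematically unchanged (see the loss_scale help text below).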


def define_performance(num_parallel_calls=False,
                       inter_op=False,
                       intra_op=False,
                       synthetic_data=False,
                       max_train_steps=False,
                       dtype=False,
                       all_reduce_alg=False,
                       num_packs=False,
                       tf_gpu_thread_mode=False,
                       datasets_num_private_threads=False,
                       datasets_num_parallel_batches=False,
                       dynamic_loss_scale=False,
                       fp16_implementation=False,
                       loss_scale=False,
                       tf_data_experimental_slack=False,
                       enable_xla=False,
                       training_dataset_cache=False):
  """Register flags for specifying performance tuning arguments.

  Args:
    num_parallel_calls: Create a flag to specify parallelism of data loading.
    inter_op: Create a flag to allow specification of inter op threads.
    intra_op: Create a flag to allow specification of intra op threads.
    synthetic_data: Create a flag to allow the use of synthetic data.
    max_train_steps: Create a flag to allow specification of maximum number of
      training steps.
    dtype: Create flags for specifying dtype.
    all_reduce_alg: If set, forces a specific algorithm for multi-gpu.
    num_packs: If set, provides number of packs for MirroredStrategy's cross
      device ops.
    tf_gpu_thread_mode: gpu_private triggers use of a private thread pool.
    datasets_num_private_threads: Number of private threads for datasets.
    datasets_num_parallel_batches: Determines how many batches to process in
      parallel when using map and batch from tf.data.
    dynamic_loss_scale: Allow the "loss_scale" flag to take on the value
      "dynamic". Only valid if `dtype` is True.
    fp16_implementation: Create fp16_implementation flag.
    loss_scale: Controls the loss scaling, normally for mixed-precision
      training. Can only be turned on if dtype is also True.
    tf_data_experimental_slack: Determines whether to enable tf.data's
      `experimental_slack` option.
    enable_xla: Determines if XLA (auto clustering) is turned on.
    training_dataset_cache: Whether to cache the training dataset on workers.
      Typically used to improve training performance when training data is in
      remote storage and can fit into worker memory.

  Returns:
    A list of flags for core.py to mark as key flags.
  """

  key_flags = []
  if num_parallel_calls:
    flags.DEFINE_integer(
        name="num_parallel_calls",
        short_name="npc",
        default=multiprocessing.cpu_count(),
        help=help_wrap("The number of records that are processed in parallel "
                       "during input processing. This can be optimized per "
                       "data set but for generally homogeneous data sets, "
                       "should be approximately the number of available CPU "
                       "cores. (default behavior)"))

  if inter_op:
    flags.DEFINE_integer(
        name="inter_op_parallelism_threads",
        short_name="inter",
        default=0,
        help=help_wrap("Number of inter_op_parallelism_threads to use for CPU. "
                       "See TensorFlow config.proto for details."))

  if intra_op:
    flags.DEFINE_integer(
        name="intra_op_parallelism_threads",
        short_name="intra",
        default=0,
        help=help_wrap("Number of intra_op_parallelism_threads to use for CPU. "
                       "See TensorFlow config.proto for details."))

  if synthetic_data:
    flags.DEFINE_bool(
        name="use_synthetic_data",
        short_name="synth",
        default=False,
        help=help_wrap(
            "If set, use fake data (zeroes) instead of a real dataset. "
            "This mode is useful for performance debugging, as it removes "
            "input processing steps, but will not learn anything."))

  if max_train_steps:
    flags.DEFINE_integer(
        name="max_train_steps",
        short_name="mts",
        default=None,
        help=help_wrap(
            "The model will stop training if the global_step reaches this "
            "value. If not set, training will run until the specified number "
            "of epochs have run as usual. It is generally recommended to set "
            "--train_epochs=1 when using this flag."))

  if dtype:
    flags.DEFINE_enum(
        name="dtype",
        short_name="dt",
        default="fp32",
        enum_values=DTYPE_MAP.keys(),
        help=help_wrap("The TensorFlow datatype used for calculations. "
                       "Variables may be cast to a higher precision on a "
                       "case-by-case basis for numerical stability."))

    loss_scale_help_text = (
        "The amount to scale the loss by when the model is run. {}. Before "
        "gradients are computed, the loss is multiplied by the loss scale, "
        "making all gradients loss_scale times larger. To adjust for this, "
        "gradients are divided by the loss scale before being applied to "
        "variables. This is mathematically equivalent to training without "
        "a loss scale, but the loss scale helps avoid some intermediate "
        "gradients from underflowing to zero. If not provided, the default "
        "is 128 for fp16 and 1 for all other dtypes.{}")
    if dynamic_loss_scale:
      loss_scale_help_text = loss_scale_help_text.format(
          "This can be an int/float or the string 'dynamic'",
          " The string 'dynamic' can be used to dynamically determine the "
          "optimal loss scale during training, but currently this "
          "significantly slows down performance")
      loss_scale_validation_msg = ("loss_scale should be a positive int/float "
                                   "or the string 'dynamic'.")
    else:
      loss_scale_help_text = loss_scale_help_text.format(
          "This must be an int/float", "")
      loss_scale_validation_msg = "loss_scale should be a positive int/float."
    if loss_scale:
      flags.DEFINE_string(
          name="loss_scale",
          short_name="ls",
          default=None,
          help=help_wrap(loss_scale_help_text))

      @flags.validator(
          flag_name="loss_scale", message=loss_scale_validation_msg)
      def _check_loss_scale(loss_scale):  # pylint: disable=unused-variable
        """Validator to check the loss scale flag is valid."""
        if loss_scale is None:
          return True  # null case is handled in get_loss_scale()

        if loss_scale == "dynamic" and dynamic_loss_scale:
          return True

        try:
          loss_scale = float(loss_scale)
        except ValueError:
          return False

        return loss_scale > 0

    if fp16_implementation:
      flags.DEFINE_enum(
          name="fp16_implementation",
          default="keras",
          enum_values=("keras", "graph_rewrite"),
          help=help_wrap(
              "When --dtype=fp16, how fp16 should be implemented. This has no "
              "impact on correctness. 'keras' uses the "
              "tf.keras.mixed_precision API. 'graph_rewrite' uses the "
              "tf.train.experimental.enable_mixed_precision_graph_rewrite "
              "API."))

      @flags.multi_flags_validator(
          ["fp16_implementation", "dtype", "loss_scale"])
      def _check_fp16_implementation(flags_dict):
        """Validator to check fp16_implementation flag is valid."""
        if (flags_dict["fp16_implementation"] == "graph_rewrite" and
            flags_dict["dtype"] != "fp16"):
          raise flags.ValidationError("--fp16_implementation should not be "
                                      "specified unless --dtype=fp16")
        return True

  if all_reduce_alg:
    flags.DEFINE_string(
        name="all_reduce_alg",
        short_name="ara",
        default=None,
        help=help_wrap("Defines the algorithm to use for performing all-reduce."
                       " When specified with MirroredStrategy for single "
                       "worker, this controls "
                       "tf.contrib.distribute.AllReduceCrossTowerOps.  When "
                       "specified with MultiWorkerMirroredStrategy, this "
                       "controls "
                       "tf.distribute.experimental.CollectiveCommunication; "
                       "valid options are `ring` and `nccl`."))

  if num_packs:
    flags.DEFINE_integer(
        name="num_packs",
        default=1,
        help=help_wrap("Sets `num_packs` in the cross device ops used in "
                       "MirroredStrategy.  For details, see "
                       "tf.distribute.NcclAllReduce."))

  if tf_gpu_thread_mode:
    flags.DEFINE_string(
        name="tf_gpu_thread_mode",
        short_name="gt_mode",
        default=None,
        help=help_wrap(
            "Whether and how the GPU device uses its own threadpool."))

    flags.DEFINE_integer(
        name="per_gpu_thread_count",
        short_name="pgtc",
        default=0,
        help=help_wrap("The number of threads to use for GPU. Only valid when "
                       "tf_gpu_thread_mode is not global."))

  if datasets_num_private_threads:
    flags.DEFINE_integer(
        name="datasets_num_private_threads",
        default=None,
        help=help_wrap(
            "Number of threads for a private threadpool created for all "
            "datasets computation."))

  if datasets_num_parallel_batches:
    flags.DEFINE_integer(
        name="datasets_num_parallel_batches",
        default=None,
        help=help_wrap(
            "Determines how many batches to process in parallel when using "
            "map and batch from tf.data."))

  if training_dataset_cache:
    flags.DEFINE_boolean(
        name="training_dataset_cache",
        default=False,
        help=help_wrap(
            "Determines whether to cache the training dataset on workers. "
            "Typically used to improve training performance when training "
            "data is in remote storage and can fit into worker memory."))

  if tf_data_experimental_slack:
    flags.DEFINE_boolean(
        name="tf_data_experimental_slack",
        default=False,
        help=help_wrap(
            "Whether to enable tf.data's `experimental_slack` option."))

  if enable_xla:
    flags.DEFINE_boolean(
        name="enable_xla",
        default=False,
        help="Whether to enable XLA auto jit compilation")

  return key_flags
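

# Minimal usage sketch; an assumption about how a caller wires these helpers
# together, not part of the official training binaries. Running this module
# directly with e.g. `--dtype=fp16 --loss_scale=dynamic` registers the flags,
# parses the command line, and prints the resolved dtype and loss scale.
if __name__ == "__main__":
  from absl import app

  define_performance(
      dtype=True,
      loss_scale=True,
      dynamic_loss_scale=True,
      fp16_implementation=True)

  def _demo(_):
    flags_obj = flags.FLAGS
    print("dtype:", get_tf_dtype(flags_obj))
    print("loss scale:", get_loss_scale(flags_obj, default_for_fp16=128))

  app.run(_demo)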