distribution_utils.py 11.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for running models in a distributed setting."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

21
22
import json
import os
23
24
import random
import string
25
26
import tensorflow as tf

27

28
29
def _collective_communication(all_reduce_alg):
  """Map an all-reduce algorithm name to a CollectiveCommunication value.

  Args:
    all_reduce_alg: name of the collective communication to use ('ring' or
      'nccl'), or None to select `CollectiveCommunication.AUTO`.

  Returns:
    The matching tf.distribute.experimental.CollectiveCommunication member.

  Raises:
    ValueError: if `all_reduce_alg` is not one of None, 'ring' or 'nccl'.
  """
  communication = tf.distribute.experimental.CollectiveCommunication
  choices = {
      None: communication.AUTO,
      "ring": communication.RING,
      "nccl": communication.NCCL,
  }
  if all_reduce_alg in choices:
    return choices[all_reduce_alg]

  message = (
      "When used with `multi_worker_mirrored`, valid values for "
      "all_reduce_alg are ['ring', 'nccl'].  Supplied value: {}")
  raise ValueError(message.format(all_reduce_alg))


def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
  """Build the CrossDeviceOps instance selected by `all_reduce_alg`.

  Args:
    all_reduce_alg: name of the cross device op to use ('nccl' or
      'hierarchical_copy'), or None to request no explicit op.
    num_packs: integer forwarded as `num_packs` to the selected op class.

  Returns:
    An instance of the selected tf.distribute cross device op class, or None
    when `all_reduce_alg` is None.

  Raises:
    ValueError: if `all_reduce_alg` is not one of None, 'nccl' or
      'hierarchical_copy'.
  """
  # None is answered before the lookup table is built.
  if all_reduce_alg is None:
    return None

  ops_by_name = {
      "nccl": tf.distribute.NcclAllReduce,
      "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce,
  }
  if all_reduce_alg in ops_by_name:
    return ops_by_name[all_reduce_alg](num_packs=num_packs)

  message = (
      "When used with `mirrored`, valid values for all_reduce_alg are "
      "['nccl', 'hierarchical_copy'].  Supplied value: {}")
  raise ValueError(message.format(all_reduce_alg))
80

81

82
83
def get_distribution_strategy(distribution_strategy="default",
                              num_gpus=0,
                              num_workers=1,
                              all_reduce_alg=None,
                              num_packs=1):
  """Return a DistributionStrategy for running the model.

  Args:
    distribution_strategy: a string specifying which distribution strategy to
      use. Accepted values are 'off', 'default', 'one_device', 'mirrored',
      'parameter_server', 'multi_worker_mirrored', case insensitive. 'off' means
      not to use Distribution Strategy; 'default' picks `OneDeviceStrategy`
      when `num_gpus` is 0 or 1 and `MirroredStrategy` otherwise.
    num_gpus: Number of GPUs to run this model.
    num_workers: Number of workers to run this model. Only used in the error
      message raised for the 'off' strategy.
    all_reduce_alg: Optional. Specifies which algorithm to use when performing
      all-reduce. For `MirroredStrategy`, valid values are "nccl" and
      "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
      "ring" and "nccl".  If None, DistributionStrategy will choose based on
      device topology.
    num_packs: Optional.  Sets the `num_packs` in `tf.distribute.NcclAllReduce`
      or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.

  Returns:
    tf.distribute.DistributionStrategy object, or None when
    `distribution_strategy` is 'off'.

  Raises:
    ValueError: if `distribution_strategy` is 'off' or 'one_device' and
      `num_gpus` is larger than 1; if `num_gpus` is negative; or if
      `distribution_strategy` is not one of the accepted values.
  """
  if num_gpus < 0:
    raise ValueError("`num_gpus` can not be negative.")

  distribution_strategy = distribution_strategy.lower()
  if distribution_strategy == "off":
    if num_gpus > 1:
      raise ValueError(
          "When {} GPUs and  {} workers are specified, distribution_strategy "
          "flag cannot be set to 'off'.".format(num_gpus, num_workers))
    return None

  if distribution_strategy == "multi_worker_mirrored":
    return tf.distribute.experimental.MultiWorkerMirroredStrategy(
        communication=_collective_communication(all_reduce_alg))

  if (distribution_strategy == "one_device" or
      (distribution_strategy == "default" and num_gpus <= 1)):
    if num_gpus == 0:
      return tf.distribute.OneDeviceStrategy("device:CPU:0")
    else:
      # Only reachable for an explicit 'one_device' request.
      if num_gpus > 1:
        raise ValueError("`OneDeviceStrategy` can not be used for more than "
                         "one device.")
      return tf.distribute.OneDeviceStrategy("device:GPU:0")

  if distribution_strategy in ("mirrored", "default"):
    if num_gpus == 0:
      # 'default' with zero GPUs was already handled by the branch above, so
      # only an explicit 'mirrored' request can get here.
      assert distribution_strategy == "mirrored"
      devices = ["device:CPU:0"]
    else:
      devices = ["device:GPU:%d" % i for i in range(num_gpus)]
    return tf.distribute.MirroredStrategy(
        devices=devices,
        cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs))

  if distribution_strategy == "parameter_server":
    return tf.distribute.experimental.ParameterServerStrategy()

  raise ValueError(
      "Unrecognized Distribution Strategy: %r" % distribution_strategy)

153

154
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device. `batch_size` is returned unchanged when `num_gpus`
    is 0 or 1.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  # Floor division keeps the result exact for arbitrarily large integers; true
  # division would round-trip through a float first.
  return int(batch_size // num_gpus)
183

Toby Boyd's avatar
Toby Boyd committed
184

185
186
187
188
189
190
191
192
193
194
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
  """A dataset that generates synthetic data on each device."""

  def __init__(self, dataset, split_by=1):
    """Capture the first element of `dataset` into local variables.

    Args:
      dataset: dataset whose first element (`dataset.take(1)`) is read once.
      split_by: passed to `tf.split` as `num_or_size_splits` along axis 0 for
        every tensor of that element; only the first resulting piece is kept.
    """
    # dataset.take(1) doesn't have GPU kernel.
    with tf.device('device:CPU:0'):
      tensor = tf.data.experimental.get_single_element(dataset.take(1))
    flat_tensor = tf.nest.flatten(tensor)
    variable_data = []
    self._initializers = []
    for t in flat_tensor:
      rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
      assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
      # Each piece is stored in a local variable under a random name.
      v = tf.compat.v1.get_local_variable(self.random_name(),
                                          initializer=rebatched_t)
      variable_data.append(v)
      self._initializers.append(v.initializer)
    # Restore the original nested structure, now holding variables.
    self._input_data = tf.nest.pack_sequence_as(tensor, variable_data)

  def get_next(self):
    """Return the stored synthetic element; it is the same on every call."""
    return self._input_data

  def initialize(self):
    """Return a no-op when executing eagerly, else the variable initializers."""
    if tf.executing_eagerly():
      return tf.no_op()
    else:
      return self._initializers

  def random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
    """Return a string of `size` characters drawn at random from `chars`."""
    return ''.join(random.choice(chars) for _ in range(size))
220
221
222
223
224


def _monkey_patch_dataset_method(strategy):
  """Monkey-patch `strategy`'s `make_dataset_iterator` method.

  The current method is saved as `strategy.org_make_dataset_iterator` and
  replaced by one that returns a `SyntheticDataset`. Calling this again on an
  already patched `strategy` does nothing, so the saved method is never
  overwritten by the patch itself.

  Args:
    strategy: object (the callers in this file pass strategy classes) whose
      `make_dataset_iterator` attribute is replaced.
  """
  current = strategy.make_dataset_iterator
  if getattr(current, '_is_synthetic_data_patch', False):
    # Already patched: saving `current` again would lose the real original.
    return

  def make_dataset_iterator(self, dataset):
    tf.compat.v1.logging.info('Using pure synthetic data.')
    with self.scope():
      if self.extended._global_batch_size:  # pylint: disable=protected-access
        return SyntheticDataset(dataset, self.num_replicas_in_sync)
      else:
        return SyntheticDataset(dataset)

  # Marker read by the idempotence check above.
  make_dataset_iterator._is_synthetic_data_patch = True  # pylint: disable=protected-access

  strategy.org_make_dataset_iterator = current
  strategy.make_dataset_iterator = make_dataset_iterator


def _undo_monkey_patch_dataset_method(strategy):
  """Restore `strategy.make_dataset_iterator` from its saved original.

  Does nothing when `strategy` has no `org_make_dataset_iterator` attribute.

  Args:
    strategy: object whose `make_dataset_iterator` attribute is restored.
  """
  if hasattr(strategy, 'org_make_dataset_iterator'):
    strategy.make_dataset_iterator = strategy.org_make_dataset_iterator


def set_up_synthetic_data():
  """Patch `make_dataset_iterator` on the strategy classes.

  Patches `tf.distribute.OneDeviceStrategy` and
  `tf.distribute.MirroredStrategy`, plus their `tf.contrib.distribute`
  counterparts when `tf` has a `contrib` attribute.
  """
  _monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
  _monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
  # TODO(tobyboyd): Remove when contrib.distribute is all in core.
  if hasattr(tf, 'contrib'):
    _monkey_patch_dataset_method(tf.contrib.distribute.MirroredStrategy)
    _monkey_patch_dataset_method(tf.contrib.distribute.OneDeviceStrategy)
  else:
    print('Contrib missing: Skip monkey patch tf.contrib.distribute.*')
250
251
252


def undo_set_up_synthetic_data():
  """Restore `make_dataset_iterator` on the strategy classes.

  Restores `tf.distribute.OneDeviceStrategy` and
  `tf.distribute.MirroredStrategy`, plus their `tf.contrib.distribute`
  counterparts when `tf` has a `contrib` attribute.
  """
  _undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
  _undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
  # TODO(tobyboyd): Remove when contrib.distribute is all in core.
  if hasattr(tf, 'contrib'):
    _undo_monkey_patch_dataset_method(tf.contrib.distribute.MirroredStrategy)
    _undo_monkey_patch_dataset_method(tf.contrib.distribute.OneDeviceStrategy)
  else:
    print('Contrib missing: Skip remove monkey patch tf.contrib.distribute.*')
261
262
263
264
265
266
267
268
269
270
271
272
273


def configure_cluster(worker_hosts=None, task_index=-1):
  """Set multi-worker cluster spec in TF_CONFIG environment variable.

  A non-empty `TF_CONFIG` that is already present takes precedence: it is left
  untouched and only used to count the 'chief' and 'worker' entries of its
  'cluster' section. Otherwise, when `worker_hosts` is given, `TF_CONFIG` is
  written with those hosts as workers and this process as a 'worker' task.

  Args:
    worker_hosts: comma-separated list of worker ip:port pairs. Whitespace
      around each entry is ignored.
    task_index: index of this process in `worker_hosts`. Must be
      non-negative when more than one worker is listed; with a single worker
      it is ignored and 0 is used.

  Returns:
    Number of workers in the cluster.

  Raises:
    ValueError: if several workers are listed and `task_index` is negative.
  """
  # `or '{}'` also covers a variable that is set but empty, which json.loads
  # would otherwise reject.
  tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
  if tf_config:
    num_workers = (len(tf_config['cluster'].get('chief', [])) +
                   len(tf_config['cluster'].get('worker', [])))
  elif worker_hosts:
    workers = [host.strip() for host in worker_hosts.split(',')]
    num_workers = len(workers)
    if num_workers > 1 and task_index < 0:
      raise ValueError('Must specify task_index when number of workers > 1')
    task_index = 0 if num_workers == 1 else task_index
    os.environ['TF_CONFIG'] = json.dumps({
        'cluster': {
            'worker': workers
        },
        'task': {'type': 'worker', 'index': task_index}
    })
  else:
    num_workers = 1
  return num_workers
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308


def get_strategy_scope(strategy):
  """Return `strategy.scope()`, or a `DummyContextManager` if `strategy` is falsy."""
  return strategy.scope() if strategy else DummyContextManager()


class DummyContextManager(object):
  """Context manager that does nothing on entry or exit."""

  def __enter__(self):
    """Enter the context; nothing is set up and None is bound by `as`."""
    return None

  def __exit__(self, *args):
    """Leave the context; returns None, so exceptions are not suppressed."""
    return None