distribution_utils.py 7.57 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for running models in a distributed setting."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

21
22
import random
import string
23
24
25
import tensorflow as tf


26
27
28
def get_distribution_strategy(distribution_strategy="default",
                              num_gpus=0,
                              all_reduce_alg=None):
  """Build the DistributionStrategy selected by name and GPU count.

  Args:
    distribution_strategy: a string naming the strategy to use. Accepted
      values are 'off', 'default', 'one_device', 'mirrored',
      'parameter_server' and 'collective', case insensitive. 'off' means no
      Distribution Strategy is used; 'default' picks `OneDeviceStrategy` when
      `num_gpus` is at most 1 and `MirroredStrategy` otherwise.
    num_gpus: Number of GPUs to run this model. With 0 GPUs the one-device
      and mirrored strategies are placed on "device:CPU:0".
    all_reduce_alg: Optional. Only consulted for the mirrored strategy. When
      set, it is passed to tf.contrib.distribute.AllReduceCrossDeviceOps
      (with `num_packs=2`); see that class for available algorithms. When
      unset, `MirroredStrategy` is built without explicit cross-device ops.

  Returns:
    A tf.distribute.DistributionStrategy object, or None when
    `distribution_strategy` is 'off'.

  Raises:
    ValueError: if `num_gpus` is negative; if `distribution_strategy` is
      'off' or 'one_device' and `num_gpus` is larger than 1; or if
      `distribution_strategy` is not one of the accepted values.
  """
  if num_gpus < 0:
    raise ValueError("`num_gpus` can not be negative.")

  strategy_name = distribution_strategy.lower()

  if strategy_name == "off":
    if num_gpus > 1:
      raise ValueError("When {} GPUs are specified, distribution_strategy flag "
                       "cannot be set to 'off'.".format(num_gpus))
    return None

  wants_single_device = (
      strategy_name == "one_device" or
      (strategy_name == "default" and num_gpus <= 1))
  if wants_single_device:
    # Only an explicit 'one_device' request can reach here with several GPUs.
    if num_gpus > 1:
      raise ValueError("`OneDeviceStrategy` can not be used for more than "
                       "one device.")
    device = "device:CPU:0" if num_gpus == 0 else "device:GPU:0"
    return tf.distribute.OneDeviceStrategy(device)

  if strategy_name in ("mirrored", "default"):
    if num_gpus == 0:
      # 'default' with zero GPUs was already handled as a single device.
      assert strategy_name == "mirrored"
      devices = ["device:CPU:0"]
    else:
      devices = ["device:GPU:%d" % i for i in range(num_gpus)]

    mirrored_kwargs = {"devices": devices}
    if all_reduce_alg:
      mirrored_kwargs["cross_device_ops"] = (
          tf.contrib.distribute.AllReduceCrossDeviceOps(
              all_reduce_alg, num_packs=2))
    return tf.distribute.MirroredStrategy(**mirrored_kwargs)

  # The remaining strategies both take the GPU count as `num_gpus_per_worker`.
  if strategy_name == "collective":
    strategy_cls = tf.contrib.distribute.CollectiveAllReduceStrategy
  elif strategy_name == "parameter_server":
    strategy_cls = tf.contrib.distribute.ParameterServerStrategy
  else:
    raise ValueError(
        "Unrecognized Distribution Strategy: %r" % strategy_name)
  return strategy_cls(num_gpus_per_worker=num_gpus)

94
95
96
97

def per_device_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device. When `num_gpus` is 0 or 1 the global `batch_size`
    is returned unchanged.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    # Suggest the nearest smaller multiple of `num_gpus`, but never less than
    # one example per GPU: a plain `batch_size - remainder` would recommend
    # `--batch_size=0` whenever `batch_size` is smaller than `num_gpus`.
    suggested_batch_size = max(batch_size - remainder, num_gpus)
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, suggested_batch_size)
    raise ValueError(err)
  # Floor division stays exact for arbitrarily large integers, whereas true
  # division would round through a float first.
  return int(batch_size // num_gpus)
124

Toby Boyd's avatar
Toby Boyd committed
125

126
127
128
129
130
131
132
133
134
135
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
  """A dataset that generates synthetic data on each device.

  The first element of the wrapped dataset is read once at construction time
  and copied into local variables; `get_next` then always returns those
  variables, arranged in the same nested structure as the original element.
  """

  def __init__(self, dataset, split_by=1):
    """Captures the first element of `dataset` into local variables.

    Args:
      dataset: dataset whose first element (`dataset.take(1)`) becomes the
        synthetic data.
      split_by: passed to `tf.split` as `num_or_size_splits` along axis 0 for
        every tensor of the element; only the first piece is kept.
    """
    self._input_data = {}
    self._initializers = []
    # dataset.take(1) doesn't have GPU kernel.
    with tf.device('device:CPU:0'):
      first_element = tf.data.experimental.get_single_element(dataset.take(1))

    local_variables = []
    for component in tf.nest.flatten(first_element):
      variable = self._make_local_variable(component, split_by)
      local_variables.append(variable)
      self._initializers.append(variable.initializer)
    # Restore the nested structure of the element around the variables.
    self._input_data = tf.nest.pack_sequence_as(first_element, local_variables)

  def _make_local_variable(self, component, split_by):
    """Stores the first `split_by`-th piece of `component` in a local variable."""
    first_piece = tf.split(
        component, num_or_size_splits=split_by, axis=0)[0]
    assert first_piece.shape.is_fully_defined(), first_piece.shape
    return tf.compat.v1.get_local_variable(self.random_name(),
                                           initializer=first_piece)

  def get_next(self):
    """Returns the captured synthetic data (same object on every call)."""
    return self._input_data

  def initialize(self):
    """Returns a no-op when executing eagerly, else the variable initializers."""
    if not tf.executing_eagerly():
      return self._initializers
    return tf.no_op()

  def random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
    """Returns `size` characters drawn at random from `chars`."""
    picked = [random.choice(chars) for _ in range(size)]
    return ''.join(picked)
161
162
163
164
165


def _monkey_patch_dataset_method(strategy):
  """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
  def make_dataset_iterator(self, dataset):
Toby Boyd's avatar
Toby Boyd committed
166
    tf.compat.v1.logging.info('Using pure synthetic data.')
167
168
169
170
171
172
173
174
175
176
177
    with self.scope():
      if self.extended._global_batch_size:  # pylint: disable=protected-access
        return SyntheticDataset(dataset, self.num_replicas_in_sync)
      else:
        return SyntheticDataset(dataset)

  strategy.org_make_dataset_iterator = strategy.make_dataset_iterator
  strategy.make_dataset_iterator = make_dataset_iterator


def _undo_monkey_patch_dataset_method(strategy):
Toby Boyd's avatar
Toby Boyd committed
178
  if hasattr(strategy, 'org_make_dataset_iterator'):
179
180
181
182
183
    strategy.make_dataset_iterator = strategy.org_make_dataset_iterator


def _synthetic_data_strategies():
  """Returns the strategy classes that synthetic data patches, without repeats.

  `tf.distribute.OneDeviceStrategy` is included because
  `get_distribution_strategy` returns it for single-device runs; leaving it
  out would mean those runs never see the synthetic-data patch.
  """
  strategies = [
      tf.distribute.MirroredStrategy,
      tf.distribute.OneDeviceStrategy,
  ]
  # TODO(tobyboyd): Remove when contrib.distribute is all in core.
  if hasattr(tf, 'contrib'):
    contrib_distribute = tf.contrib.distribute
    for class_name in ('MirroredStrategy', 'OneDeviceStrategy'):
      candidate = getattr(contrib_distribute, class_name)
      # Compare by identity so a class that is exported under both namespaces
      # is only patched (and restored) once.
      if all(candidate is not known for known in strategies):
        strategies.append(candidate)
  return strategies


def set_up_synthetic_data():
  """Patches the known strategy classes so they serve synthetic data."""
  for strategy in _synthetic_data_strategies():
    _monkey_patch_dataset_method(strategy)
  if not hasattr(tf, 'contrib'):
    print('Contrib missing: Skip monkey patch tf.contrib.distribute.*')


def undo_set_up_synthetic_data():
  """Restores the strategy classes patched by `set_up_synthetic_data`."""
  for strategy in _synthetic_data_strategies():
    _undo_monkey_patch_dataset_method(strategy)
  if not hasattr(tf, 'contrib'):
    print('Contrib missing: Skip remove monkey patch tf.contrib.distribute.*')