Unverified commit 94b57bf7, authored by Julien Plu and committed by GitHub

[TF 2.2 compat] use tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283)

* Fix the gradient accumulator so it runs properly with TF 2.2

* Apply style

* Fix training_args_tf for TF 2.2

* Fix the TF training args when only one GPU is available

* Remove the pinned TF version from setup.py
parent cffbb3d8
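For background on the core change: under a distribution strategy, a variable created with `synchronization=ON_READ` keeps a separate copy per replica, and TF 2.2 refuses to read such a variable in cross-replica context unless an aggregation policy says which copy to return; that is what the `ONLY_FIRST_REPLICA` additions below supply. A minimal sketch of the pattern (my illustration, not code from this commit):

```python
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # Each replica holds its own copy of an ON_READ variable;
    # ONLY_FIRST_REPLICA tells TF which copy to hand back when the
    # value is read outside strategy.run (cross-replica context).
    steps = tf.Variable(
        tf.constant(0, dtype=tf.int64),
        trainable=False,
        synchronization=tf.VariableSynchronization.ON_READ,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
    )

# Reads cleanly under TF 2.2; without an aggregation policy this read
# is what broke, per the commit title above.
print(steps.value())
```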
setup.py
@@ -67,8 +67,8 @@ extras = {}
 extras["mecab"] = ["mecab-python3"]
 extras["sklearn"] = ["scikit-learn"]
-extras["tf"] = ["tensorflow<=2.1.0"]
-extras["tf-cpu"] = ["tensorflow-cpu<=2.1.0"]
+extras["tf"] = ["tensorflow"]
+extras["tf-cpu"] = ["tensorflow-cpu"]
 extras["torch"] = ["torch"]
 extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
@@ -81,7 +81,7 @@ extras["quality"] = [
     "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
     "flake8",
 ]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow<=2.1.0", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
 setup(
     name="transformers",
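With the `<=2.1.0` pin removed, `pip install transformers[tf]` (or `[tf-cpu]`/`[dev]`) resolves to the latest TensorFlow release, including 2.2, which is the version the accumulator and training-argument fixes below target.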
src/transformers/optimization_tf.py
@@ -204,7 +204,10 @@ class GradientAccumulator(object):
         """Number of accumulated steps."""
         if self._accum_steps is None:
             self._accum_steps = tf.Variable(
-                tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                tf.constant(0, dtype=tf.int64),
+                trainable=False,
+                synchronization=tf.VariableSynchronization.ON_READ,
+                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
             )
         return self._accum_steps.value()
@@ -223,7 +226,10 @@ class GradientAccumulator(object):
             self._gradients.extend(
                 [
                     tf.Variable(
-                        tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                        tf.zeros_like(gradient),
+                        trainable=False,
+                        synchronization=tf.VariableSynchronization.ON_READ,
+                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                     )
                     for gradient in gradients
                 ]
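A rough usage sketch of the accumulator patched above, assuming `GradientAccumulator` is importable from the top-level package and exposes the interface visible in this file (callable to accumulate, `.gradients` to read, `.reset()` to clear); treat it as an illustration, not a verbatim recipe:

```python
import tensorflow as tf
from transformers import GradientAccumulator

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.build(input_shape=(None, 4))
optimizer = tf.keras.optimizers.Adam(1e-3)
accumulator = GradientAccumulator()

for step in range(8):
    x = tf.random.normal((2, 4))
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(x)))
    grads = tape.gradient(loss, model.trainable_variables)
    accumulator(grads)  # accumulate this micro-batch's gradients
    if (step + 1) % 4 == 0:  # every 4 micro-batches, apply and reset
        optimizer.apply_gradients(zip(accumulator.gradients, model.trainable_variables))
        accumulator.reset()
```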
src/transformers/training_args_tf.py
@@ -56,9 +56,11 @@ class TFTrainingArguments(TrainingArguments):
             strategy = tf.distribute.experimental.TPUStrategy(tpu)
         elif len(gpus) == 0:
             strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+        elif len(gpus) == 1:
+            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
         elif len(gpus) > 1:
             # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
-            strategy = tf.distribute.MirroredStrategy(gpus)
+            strategy = tf.distribute.MirroredStrategy()
         else:
             raise ValueError("Cannot find the proper strategy please check your environment properties.")
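Two notes on the branch above: `tf.distribute.MirroredStrategy()` with no arguments already mirrors across every visible GPU, and a lone GPU is cheaper to drive through `OneDeviceStrategy`, which skips the cross-replica machinery. A standalone paraphrase of the selection logic (the `list_physical_devices` call is my assumption, not taken from this file):

```python
import tensorflow as tf

def pick_strategy() -> tf.distribute.Strategy:
    # Restrict visible GPUs with CUDA_VISIBLE_DEVICES before calling this.
    gpus = tf.config.list_physical_devices("GPU")
    if len(gpus) == 0:
        return tf.distribute.OneDeviceStrategy(device="/cpu:0")
    if len(gpus) == 1:
        # One device: no need for MirroredStrategy's replica coordination.
        return tf.distribute.OneDeviceStrategy(device="/gpu:0")
    # No argument: mirror across all visible GPUs.
    return tf.distribute.MirroredStrategy()
```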