Unverified commit 94b57bf7, authored by Julien Plu and committed by GitHub

[TF 2.2 compat] use tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283)

* Fix the gradient accumulator so it runs properly with TF 2.2

* Apply style

* Fix training_args_tf for TF 2.2

* Fix the TF training args when only one GPU is available

* Remove the pinned TF version from setup.py
parent cffbb3d8
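For background on the core change: under a distribution strategy, a variable created with `synchronization=ON_READ` keeps a separate copy per replica, and TF 2.2 refuses to read such a variable in cross-replica context unless an aggregation policy says which copy to return; that is what the `ONLY_FIRST_REPLICA` additions below supply. A minimal sketch of the pattern (my illustration, not code from this commit):

```python
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # Each replica holds its own copy of an ON_READ variable;
    # ONLY_FIRST_REPLICA tells TF which copy to hand back when the
    # value is read outside strategy.run (cross-replica context).
    steps = tf.Variable(
        tf.constant(0, dtype=tf.int64),
        trainable=False,
        synchronization=tf.VariableSynchronization.ON_READ,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
    )

# Reads cleanly under TF 2.2; without an aggregation policy this read
# is what broke, per the commit title above.
print(steps.value())
```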
setup.py
@@ -67,8 +67,8 @@ extras = {}
 extras["mecab"] = ["mecab-python3"]
 extras["sklearn"] = ["scikit-learn"]
-extras["tf"] = ["tensorflow<=2.1.0"]
-extras["tf-cpu"] = ["tensorflow-cpu<=2.1.0"]
+extras["tf"] = ["tensorflow"]
+extras["tf-cpu"] = ["tensorflow-cpu"]
 extras["torch"] = ["torch"]
 extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
@@ -81,7 +81,7 @@ extras["quality"] = [
     "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
     "flake8",
 ]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow<=2.1.0", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
 setup(
     name="transformers",
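With the `<=2.1.0` pin removed, `pip install transformers[tf]` (or `[tf-cpu]`/`[dev]`) resolves to the latest TensorFlow release, including 2.2, which is the version the accumulator and training-argument fixes below target.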
src/transformers/optimization_tf.py
@@ -204,7 +204,10 @@ class GradientAccumulator(object):
         """Number of accumulated steps."""
         if self._accum_steps is None:
             self._accum_steps = tf.Variable(
-                tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                tf.constant(0, dtype=tf.int64),
+                trainable=False,
+                synchronization=tf.VariableSynchronization.ON_READ,
+                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
             )
         return self._accum_steps.value()
@@ -223,7 +226,10 @@ class GradientAccumulator(object):
             self._gradients.extend(
                 [
                     tf.Variable(
-                        tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                        tf.zeros_like(gradient),
+                        trainable=False,
+                        synchronization=tf.VariableSynchronization.ON_READ,
+                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                     )
                     for gradient in gradients
                 ]
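A rough usage sketch of the accumulator patched above, assuming `GradientAccumulator` is importable from the top-level package and exposes the interface visible in this file (callable to accumulate, `.gradients` to read, `.reset()` to clear); treat it as an illustration, not a verbatim recipe:

```python
import tensorflow as tf
from transformers import GradientAccumulator

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.build(input_shape=(None, 4))
optimizer = tf.keras.optimizers.Adam(1e-3)
accumulator = GradientAccumulator()

for step in range(8):
    x = tf.random.normal((2, 4))
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(x)))
    grads = tape.gradient(loss, model.trainable_variables)
    accumulator(grads)  # accumulate this micro-batch's gradients
    if (step + 1) % 4 == 0:  # every 4 micro-batches, apply and reset
        optimizer.apply_gradients(zip(accumulator.gradients, model.trainable_variables))
        accumulator.reset()
```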
src/transformers/training_args_tf.py
@@ -56,9 +56,11 @@ class TFTrainingArguments(TrainingArguments):
             strategy = tf.distribute.experimental.TPUStrategy(tpu)
         elif len(gpus) == 0:
             strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+        elif len(gpus) == 1:
+            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
         elif len(gpus) > 1:
             # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
-            strategy = tf.distribute.MirroredStrategy(gpus)
+            strategy = tf.distribute.MirroredStrategy()
         else:
             raise ValueError("Cannot find the proper strategy please check your environment properties.")
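Two notes on the branch above: `tf.distribute.MirroredStrategy()` with no arguments already mirrors across every visible GPU, and a lone GPU is cheaper to drive through `OneDeviceStrategy`, which skips the cross-replica machinery. A standalone paraphrase of the selection logic (the `list_physical_devices` call is my assumption, not taken from this file):

```python
import tensorflow as tf

def pick_strategy() -> tf.distribute.Strategy:
    # Restrict visible GPUs with CUDA_VISIBLE_DEVICES before calling this.
    gpus = tf.config.list_physical_devices("GPU")
    if len(gpus) == 0:
        return tf.distribute.OneDeviceStrategy(device="/cpu:0")
    if len(gpus) == 1:
        # One device: no need for MirroredStrategy's replica coordination.
        return tf.distribute.OneDeviceStrategy(device="/gpu:0")
    # No argument: mirror across all visible GPUs.
    return tf.distribute.MirroredStrategy()
```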