Unverified Commit f9414f75 authored by Julien Plu, committed by GitHub

Tensorflow improvements (#4530)



* Better handling of None gradients

* Apply Style

* Apply Style

* Create a loss class per task to compute its respective loss

* Add loss classes to the ALBERT TF models

* Add loss classes to the BERT TF models

* Add question answering and multiple choice to TF Camembert

* Remove prints

* Add multiple choice model to TF DistilBERT + loss computation

* Add question answering model to TF Electra + loss computation

* Add token classification, question answering and multiple choice models to TF Flaubert

* Add multiple choice model to TF Roberta + loss computation

* Add multiple choice model to TF XLM + loss computation

* Add multiple choice and question answering models to TF XLM-Roberta

* Add multiple choice model to TF XLNet + loss computation

* Remove unused parameters

* Add task loss classes

* Reorder TF imports + add new model classes

* Add new model classes

* Bugfix in TF T5 model

* Bugfix for TF T5 tests

* Bugfix in TF T5 model

* Fix TF T5 model tests

* Fix T5 tests + some renaming

* Fix inheritance issue in the AutoX tests

* Add tests for TF Flaubert and TF XLM Roberta

* Add tests for TF Flaubert and TF XLM Roberta

* Remove unused piece of code in the TF trainer

* bugfix and remove unused code

* Bugfix for TF 2.2

* Apply Style

* Divide TFSequenceClassificationAndMultipleChoiceLoss into its two respective classes

* Apply style

* Mirror the PT Trainer in the TF one: fp16, optimizers and tb_writer as class parameters and better dataset handling

* Fix TF optimizations tests and apply style

* Remove useless parameter

* Bugfix and apply style

* Fix TF Trainer prediction

* Now the TF models return the loss like their PyTorch counterparts

* Apply Style

* Ignore some tests output

* Take into account the SQuAD cls_index, p_mask and is_impossible parameters for the QuestionAnswering task models.

* Fix names for SQuAD data

* Apply Style

* Fix conflicts with 2.11 release

* Fix conflicts with 2.11

* Fix wrong name

* Add better documentation on the new create_optimizer function

* Fix isort

* logging_dir: use same default as PyTorch
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
parent ccd26c28
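For orientation, here is a minimal usage sketch of the refactored trainer this commit describes, based only on the constructor signature visible in the diff below; the model name, paths and toy dataset are placeholders, not part of the change.

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, TFTrainer, TFTrainingArguments

training_args = TFTrainingArguments(
    output_dir="./out",    # placeholder path
    logging_dir="./logs",  # if omitted, now defaults to a PyTorch-style runs/<date>_<host> dir
)

with training_args.strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Datasets are plain tf.data.Dataset objects yielding (features, labels) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_ids": tf.zeros((16, 8), dtype=tf.int32)}, tf.zeros((16,), dtype=tf.int32))
)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tb_writer=tf.summary.create_file_writer("./logs"),  # optional, new constructor argument
    optimizers=None,  # optional (optimizer, lr_schedule) tuple, see get_optimizers() below
)
trainer.train()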
@@ -3,12 +3,12 @@
import logging
import math
import os
from typing import Callable, Dict, Optional
from typing import Callable, Dict, Optional, Tuple
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import TFPreTrainedModel, shape_list
from .modeling_tf_utils import TFPreTrainedModel
from .optimization_tf import GradientAccumulator, create_optimizer
from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput
from .training_args_tf import TFTrainingArguments
@@ -20,13 +20,14 @@ logger = logging.getLogger(__name__)
class TFTrainer:
model: TFPreTrainedModel
args: TFTrainingArguments
# something similar to a PT Dataset.
# This is just temporary until we have
# a framework-agnostic approach for datasets.
train_dataset: Optional[tf.data.Dataset]
eval_dataset: Optional[tf.data.Dataset]
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
prediction_loss_only: bool
tb_writer: Optional[tf.summary.SummaryWriter] = None
optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None
global_step: Optional[int] = None
epoch: Optional[float] = None
def __init__(
self,
@@ -36,6 +37,8 @@ class TFTrainer:
eval_dataset: Optional[tf.data.Dataset] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
prediction_loss_only=False,
tb_writer: Optional[tf.summary.SummaryWriter] = None,
optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None,
):
self.model = model
self.args = args
@@ -43,55 +46,18 @@
self.eval_dataset = eval_dataset
self.compute_metrics = compute_metrics
self.prediction_loss_only = prediction_loss_only
self.optimizers = optimizers
self.gradient_accumulator = GradientAccumulator()
self._setup_training()
def _setup_training(self) -> None:
"""
Setup the different steps to train a model:
- check if all the data are given
- create the proper strategy
- create the features
- prepare the model settings
"""
self._prepare_dataset()
with self.args.strategy.scope():
self._create_optimizer()
_ = self.optimizer.iterations
self._set_loss_and_metric()
self._create_checkpoint_manager()
self._create_summary_writer()
def _set_loss_and_metric(self) -> None:
"""
Create the training loss and metric with their name. Allowed names are those listed
in the Tensorflow documentation and those contained in the transformers library.
"""
try:
self.loss = tf.keras.losses.get(
{
"class_name": self.args.loss_name,
"config": {"from_logits": True, "reduction": tf.keras.losses.Reduction.NONE},
}
)
except TypeError:
self.loss = tf.keras.losses.get(
{"class_name": self.args.loss_name, "config": {"reduction": tf.keras.losses.Reduction.NONE}}
)
if tb_writer is not None:
self.tb_writer = tb_writer
else:
self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir)
def _create_summary_writer(self) -> None:
"""
Create a summary writer to be able to read the logs in Tensorboard.
"""
self.writer = tf.summary.create_file_writer(self.args.logging_dir)
def get_train_tfdataset(self) -> tf.data.Dataset:
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
def _prepare_dataset(self) -> None:
"""
Prepare the training, validation and test data.
"""
if self.train_dataset is not None:
self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
if self.args.max_steps > 0:
@@ -99,7 +65,7 @@ class TFTrainer:
else:
self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size)
self.train_dataset = (
ds = (
self.train_dataset.cache()
.shuffle(self.num_train_examples)
.batch(self.args.train_batch_size)
@@ -109,54 +75,44 @@
if self.args.max_steps > 0:
self.train_dataset = self.train_dataset.repeat(-1)
self.train_dataset = self.args.strategy.experimental_distribute_dataset(self.train_dataset)
else:
self.train_steps = 0
return self.args.strategy.experimental_distribute_dataset(ds)
if self.eval_dataset is not None:
self.eval_dataset = (
self.eval_dataset.batch(self.args.eval_batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
)
self.eval_dataset = self.args.strategy.experimental_distribute_dataset(self.eval_dataset)
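As a standalone illustration of the input pipeline that get_train_tfdataset() now builds (cache, shuffle over the full set, batch, prefetch, then distribute), here is a small sketch assuming a toy in-memory dataset and the default distribution strategy.

import tensorflow as tf

strategy = tf.distribute.get_strategy()  # default (no-op) strategy, for illustration only
ds = tf.data.Dataset.from_tensor_slices((tf.range(100), tf.range(100)))

# Count the examples the same way the trainer does, by reducing over the dataset.
num_examples = ds.reduce(tf.constant(0), lambda x, _: x + 1).numpy()

ds = (
    ds.cache()
    .shuffle(num_examples)
    .batch(8)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
dist_ds = strategy.experimental_distribute_dataset(ds)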
def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
def _create_optimizer(self) -> None:
"""
Create the training optimizer with its name. Allowed names are those listed
in the Tensorflow documentation and those contained in the transformers library.
"""
if self.args.optimizer_name == "adamw":
self.optimizer = create_optimizer(
self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr
)
else:
try:
self.optimizer = tf.keras.optimizers.get(
{
"class_name": self.args.optimizer_name,
"config": {"learning_rate": self.args.learning_rate, "epsilon": self.args.adam_epsilon},
}
)
except TypeError:
# This is for the case where the optimizer is not Adam-like such as SGD
self.optimizer = tf.keras.optimizers.get(
{"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}}
)
logger.info("Created an/a {} optimizer".format(self.args.optimizer_name))
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
ds = eval_dataset.cache().batch(self.args.eval_batch_size).prefetch(tf.data.experimental.AUTOTUNE)
def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None:
"""
Create a checkpoint manager in order to be able to make the training
fault-tolerant.
Args:
max_to_keep: the maximum number of checkpoints to keep in the checkpoint path.
load_model: if we want to start the training from the latest checkpoint.
return self.args.strategy.experimental_distribute_dataset(ds)
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
ds = test_dataset.batch(self.args.eval_batch_size)
return self.args.strategy.experimental_distribute_dataset(ds)
def get_optimizers(
self,
) -> Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]:
"""
ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
Setup the optimizer and the learning rate scheduler.
self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep)
We provide a reasonable default that works well.
If you want to use something else, you can pass a tuple in the Trainer's init,
or override this method in a subclass.
"""
if self.optimizers is not None:
return self.optimizers
optimizer, scheduler = create_optimizer(
self.args.learning_rate,
self.train_steps,
self.args.warmup_steps,
adam_epsilon=self.args.adam_epsilon,
weight_decay_rate=self.args.weight_decay,
)
if load_model:
ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
return optimizer, scheduler
@tf.function
def _evaluate_steps(self, per_replica_features, per_replica_labels):
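A hedged sketch of the two ways to set up optimization after this change: the default path mirrored by get_optimizers(), where create_optimizer now returns an (optimizer, lr_schedule) pair, and a custom pair passed through the new optimizers constructor argument. The SGD/polynomial-decay choice below is purely illustrative.

import tensorflow as tf
from transformers import create_optimizer

# Default path, as in get_optimizers(): AdamW with warmup and linear decay.
optimizer, lr_schedule = create_optimizer(
    5e-5,  # learning rate
    1000,  # total training steps
    100,   # warmup steps
    adam_epsilon=1e-8,
    weight_decay_rate=0.01,
)

# Custom path: any Keras optimizer/schedule pair can be handed to the trainer instead.
custom_schedule = tf.keras.optimizers.schedules.PolynomialDecay(5e-5, 1000, end_learning_rate=0.0)
custom_optimizer = tf.keras.optimizers.SGD(learning_rate=custom_schedule)
# trainer = TFTrainer(model, training_args, train_dataset=train_dataset,
#                     optimizers=(custom_optimizer, custom_schedule))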
@@ -182,6 +138,14 @@ class TFTrainer:
def _prediction_loop(
self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
"""
Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
Works both with or without labels.
"""
prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
logger.info("***** Running %s *****", description)
logger.info(" Batch size = %d", self.args.eval_batch_size)
@@ -196,6 +160,12 @@
loss = tf.reduce_mean(loss)
if not prediction_loss_only:
if isinstance(logits, tuple):
logits = logits[0]
if isinstance(labels, tuple):
labels = labels[0]
if self.args.n_gpu > 1:
for val in logits.values:
if preds is None:
@@ -240,10 +210,9 @@
"""
Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
"""
if eval_dataset is None:
eval_dataset = self.eval_dataset
eval_ds = self.get_eval_tfdataset(eval_dataset)
output = self._prediction_loop(eval_dataset, description="Evaluation")
output = self._prediction_loop(eval_ds, description="Evaluation")
return output.metrics
@@ -251,12 +220,25 @@
"""
Train method to train the model.
"""
train_ds = self.get_train_tfdataset()
if self.args.debug:
tf.summary.trace_on(graph=True, profiler=True)
self.gradient_accumulator.reset()
iterations = self.optimizer.iterations
with self.args.strategy.scope():
optimizer, lr_scheduler = self.get_optimizers()
iterations = optimizer.iterations
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model)
self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=5)
if self.model.ckpt_manager.latest_checkpoint:
logger.info(
"Checkpoint file %s found and restoring from checkpoint", self.model.ckpt_manager.latest_checkpoint
)
ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
if iterations.numpy() > 0:
logger.info("Start the training from the last checkpoint")
@@ -268,21 +250,30 @@
epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs
if self.args.fp16:
policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
tf.keras.mixed_precision.experimental.set_policy(policy)
with self.tb_writer.as_default():
tf.summary.text("args", self.args.to_json_string())
self.tb_writer.flush()
logger.info("***** Running training *****")
logger.info(" Num examples = %d", self.num_train_examples)
logger.info(" Num Epochs = %d", epochs)
logger.info(" Total optimization steps = %d", self.train_steps)
for epoch in range(start_epoch, int(epochs + 1)):
for training_loss in self._training_steps():
for training_loss in self._training_steps(train_ds, optimizer):
step = iterations.numpy()
if self.args.debug:
with self.writer.as_default():
with self.tb_writer.as_default():
tf.summary.scalar("loss", training_loss, step=step)
if step == 1 and self.args.debug:
with self.writer.as_default():
with self.tb_writer.as_default():
tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir)
if self.args.evaluate_during_training and step % self.args.eval_steps == 0:
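The new fp16 flag simply switches Keras to the mixed_float16 policy before the model is built. A minimal sketch of what that policy does, using the TF 2.1/2.2 experimental API seen above (newer TF versions expose tf.keras.mixed_precision.set_global_policy instead):

import tensorflow as tf

policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
tf.keras.mixed_precision.experimental.set_policy(policy)

layer = tf.keras.layers.Dense(4)
print(layer.dtype)          # variables are kept in float32
print(layer.compute_dtype)  # computations run in float16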
@@ -293,17 +284,16 @@
eval_key = "eval_{}".format(key)
logs[eval_key] = value
if callable(self.optimizer.learning_rate):
logs["learning_rate"] = self.optimizer.learning_rate(step).numpy()
else:
logs["learning_rate"] = self.optimizer.learning_rate.numpy()
logs["learning_rate"] = lr_scheduler(step).numpy()
logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs))
with self.writer.as_default():
with self.tb_writer.as_default():
for k, v in logs.items():
tf.summary.scalar(k, v, step=step)
self.tb_writer.flush()
if step % self.args.logging_steps == 0:
logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy()))
@@ -314,21 +304,21 @@
if step % self.train_steps == 0:
break
def _training_steps(self):
def _training_steps(self, ds, optimizer):
"""
Returns a generator over training steps (i.e. parameter updates).
"""
for i, loss in enumerate(self._accumulate_next_gradients()):
for i, loss in enumerate(self._accumulate_next_gradients(ds)):
if i % self.args.gradient_accumulation_steps == 0:
self._apply_gradients()
self._apply_gradients(optimizer)
yield loss
@tf.function
def _apply_gradients(self):
def _apply_gradients(self, optimizer):
"""Applies the gradients (cross-replica)."""
self.args.strategy.experimental_run_v2(self._step)
self.args.strategy.experimental_run_v2(self._step, args=(optimizer,))
def _step(self):
def _step(self, optimizer):
"""Applies gradients and resets accumulation."""
gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync
gradients = [
@@ -336,12 +326,12 @@
]
gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients]
self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
self.gradient_accumulator.reset()
def _accumulate_next_gradients(self):
def _accumulate_next_gradients(self, ds):
"""Accumulates the gradients from the next element in dataset."""
iterator = iter(self.train_dataset)
iterator = iter(ds)
@tf.function
def _accumulate_next():
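For context on the accumulate-then-apply flow above, here is a small self-contained sketch of GradientAccumulator, scaling by the accumulated step count roughly as _step() does on a single replica; the gradient values are toy examples.

import tensorflow as tf
from transformers import GradientAccumulator

accumulator = GradientAccumulator()
variable = tf.Variable([4.0, 3.0])
optimizer = tf.keras.optimizers.SGD(learning_rate=1.0)

accumulator([tf.constant([1.0, 2.0])])  # gradients from micro-batch 1
accumulator([tf.constant([3.0, 4.0])])  # gradients from micro-batch 2

# Average over the accumulated steps, apply once, then reset the accumulator.
grads = [g / tf.cast(accumulator.step, tf.float32) for g in accumulator.gradients]
optimizer.apply_gradients(zip(grads, [variable]))
accumulator.reset()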
@@ -388,23 +378,10 @@
labels: the batched labels.
training: run the model in training mode or not
"""
if self.args.mode == "text-classification" or self.args.mode == "token-classification":
logits = self.model(features, training=training)[0]
if isinstance(labels, (dict)):
loss, logits = self.model(features, training=training, **labels)[:2]
else:
logits = self.model(features, training=training)
if self.args.mode == "token-classification":
active_loss = tf.reshape(labels, (-1,)) != -1
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
loss = self.loss(labels, reduced_logits)
elif self.args.mode == "question-answering":
start_loss = self.loss(labels["start_position"], logits[0])
end_loss = self.loss(labels["end_position"], logits[1])
loss = (start_loss + end_loss) / 2.0
else:
loss = self.loss(labels, logits)
loss, logits = self.model(features, labels=labels, training=training)[:2]
loss += sum(self.model.losses) * (1.0 / self.args.n_gpu)
return loss, logits
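run_model() can be simplified this way because, with this change, the TF task models compute and return their loss when labels are passed, like their PyTorch counterparts. A tiny sketch with a randomly initialised model; the small config values are illustrative only:

import tensorflow as tf
from transformers import BertConfig, TFBertForSequenceClassification

config = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
model = TFBertForSequenceClassification(config)

features = {"input_ids": tf.constant([[101, 7592, 2088, 102]], dtype=tf.int32)}
labels = tf.constant([1], dtype=tf.int32)

# With labels, the first output is the per-example loss, followed by the logits.
loss, logits = model(features, labels=labels, training=False)[:2]
print(loss.shape, logits.shape)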
@@ -418,19 +395,24 @@
test_dataset: something similar to a PT Dataset. This is just
temporary until we have a framework-agnostic approach for datasets.
"""
test_dataset = test_dataset.batch(self.args.eval_batch_size)
test_dataset = self.args.strategy.experimental_distribute_dataset(test_dataset)
test_ds = self.get_test_tfdataset(test_dataset)
return self._prediction_loop(test_dataset, description="Prediction")
return self._prediction_loop(test_ds, description="Prediction")
def save_model(self) -> None:
def save_model(self, output_dir: Optional[str] = None):
"""
Save the pretrained model and create a Tensorflow saved model.
"""
logger.info("Saving model in {}".format(self.args.output_dir))
output_dir = output_dir if output_dir is not None else self.args.output_dir
logger.info("Saving model in {}".format(output_dir))
path = os.path.join(self.args.output_dir, "saved_model")
logger.info("Saving model in {}".format(path))
os.makedirs(path, exist_ok=True)
if not isinstance(self.model, TFPreTrainedModel):
raise ValueError("Trainer.model appears to not be a PreTrainedModel")
self.model.save_pretrained(self.args.output_dir)
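Usage note, continuing the hypothetical trainer from the first sketch: save_model() now accepts an optional output_dir (falling back to args.output_dir), and the weights written by save_pretrained can be reloaded afterwards.

from transformers import TFAutoModelForSequenceClassification

trainer.save_model()  # writes config + weights into training_args.output_dir
reloaded = TFAutoModelForSequenceClassification.from_pretrained(training_args.output_dir)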
import dataclasses
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Tuple
@@ -27,6 +28,17 @@ def is_tpu_available():
logger = logging.getLogger(__name__)
def default_logdir() -> str:
"""
Same default as PyTorch
"""
import socket
from datetime import datetime
current_time = datetime.now().strftime("%b%d_%H-%M-%S")
return os.path.join("runs", current_time + "_" + socket.gethostname())
@dataclass
class TrainingArguments:
"""
@@ -97,7 +109,7 @@ class TrainingArguments:
)
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
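With default_factory=default_logdir, a freshly built (TF)TrainingArguments now picks a PyTorch-style run directory when logging_dir is omitted. A quick check; the output depends on the current time and hostname, and the output_dir value is a placeholder:

from transformers import TFTrainingArguments

args = TFTrainingArguments(output_dir="./out")
print(args.logging_dir)  # e.g. runs/Jun04_13-42-10_<hostname>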
......
@@ -14,28 +14,9 @@ if is_tf_available():
@dataclass
class TFTrainingArguments(TrainingArguments):
optimizer_name: str = field(
default="adam",
metadata={
"help": 'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"'
},
)
mode: str = field(
default="text-classification",
metadata={"help": 'Type of task, one of "text-classification", "token-classification", "question-answering"'},
)
loss_name: str = field(
default="SparseCategoricalCrossentropy",
metadata={
"help": "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses"
},
)
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
)
end_lr: float = field(
default=0, metadata={"help": "End learning rate for optimizer"},
)
eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."})
debug: bool = field(
default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"}
......
@@ -30,7 +30,7 @@ if is_tf_available():
import tensorflow as tf
import numpy as np
from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding
from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding, TFSharedEmbeddings
if _tf_gpu_memory_limit is not None:
gpus = tf.config.list_physical_devices("GPU")
@@ -107,16 +107,32 @@ class TFModelTesterMixin:
and getattr(module_member, "_keras_serializable", False)
)
for main_layer_class in tf_main_layer_classes:
# T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
if "T5" in main_layer_class.__name__:
# Take the same values as in TFT5ModelTester for this shared layer
shared = TFSharedEmbeddings(99, 32, name="shared")
main_layer = main_layer_class(config, embed_tokens=shared)
else:
main_layer = main_layer_class(config)
symbolic_inputs = {
name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
}
model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
outputs = model(inputs_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "keras_model.h5")
model.save(filepath)
if "T5" in main_layer_class.__name__:
model = tf.keras.models.load_model(
filepath,
custom_objects={
main_layer_class.__name__: main_layer_class,
"TFSharedEmbeddings": TFSharedEmbeddings,
},
)
else:
model = tf.keras.models.load_model(
filepath, custom_objects={main_layer_class.__name__: main_layer_class}
)
@@ -126,6 +142,9 @@ class TFModelTesterMixin:
def assert_outputs_same(self, after_outputs, outputs):
# Make sure we don't have nans
if isinstance(after_outputs, tf.Tensor):
out_1 = after_outputs.numpy()
else:
out_1 = after_outputs[0].numpy()
out_2 = outputs[0].numpy()
self.assertEqual(out_1.shape, out_2.shape)
@@ -269,7 +288,6 @@ class TFModelTesterMixin:
inputs_keywords = copy.deepcopy(inputs_dict)
input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,)
outputs_keywords = model(input_ids, **inputs_keywords)
output_dict = outputs_dict[0].numpy()
output_keywords = outputs_keywords[0].numpy()
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_tf_available
from .utils import require_tf, slow
if is_tf_available():
import tensorflow as tf
import numpy as np
from transformers import TFFlaubertModel
@require_tf
class TFFlaubertModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased")
input_ids = tf.convert_to_tensor(
[[0, 158, 735, 2592, 1424, 6727, 82, 1]], dtype=tf.int32,
) # "J'aime flaubert !"
output = model(input_ids)[0]
expected_shape = tf.TensorShape((1, 8, 512))
self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice.
expected_slice = tf.convert_to_tensor(
[
[
[-1.8768773, -1.566555, 0.27072418],
[-1.6920038, -0.5873505, 1.9329599],
[-2.9563985, -1.6993835, 1.7972052],
]
],
dtype=tf.float32,
)
self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_tf_available
from .utils import require_tf, slow
if is_tf_available():
import tensorflow as tf
import numpy as np
from transformers import TFXLMRobertaModel
@require_tf
class TFXLMRobertaModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")
features = {
"input_ids": tf.convert_to_tensor([[0, 2646, 10269, 83, 99942, 2]], dtype=tf.int32), # "My dog is cute"
"attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32),
}
output = model(features)[0]
expected_shape = tf.TensorShape((1, 6, 768))
self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice.
expected_slice = tf.convert_to_tensor(
[
[
[0.0681762, 0.10894451, 0.06772504],
[-0.06423668, 0.02366615, 0.04329344],
[-0.06057295, 0.09974135, -0.00070584],
]
],
dtype=tf.float32,
)
self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
@@ -47,7 +47,7 @@ class OptimizationFTest(unittest.TestCase):
with strategy.scope():
accumulator = GradientAccumulator()
variable = tf.Variable([4.0, 3.0])
optimizer = create_optimizer(5e-5, 10, 5)
optimizer, _ = create_optimizer(5e-5, 10, 5)
gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
def accumulate_on_replica(gradient):
......