Unverified commit f9414f75, authored by Julien Plu and committed by GitHub
Browse files

Tensorflow improvements (#4530)



* Better None gradients handling

* Apply Style

* Apply Style

* Create a loss class per task to compute its respective loss

* Add loss classes to the ALBERT TF models

* Add loss classes to the BERT TF models

* Add question answering and multiple choice to TF Camembert

* Remove prints

* Add multiple choice model to TF DistilBERT + loss computation

* Add question answering model to TF Electra + loss computation

* Add token classification, question answering and multiple choice models to TF Flaubert

* Add multiple choice model to TF Roberta + loss computation

* Add multiple choice model to TF XLM + loss computation

* Add multiple choice and question answering models to TF XLM-Roberta

* Add multiple choice model to TF XLNet + loss computation

* Remove unused parameters

* Add task loss classes

* Reorder TF imports + add new model classes

* Add new model classes

* Bugfix in TF T5 model

* Bugfix for TF T5 tests

* Bugfix in TF T5 model

* Fix TF T5 model tests

* Fix T5 tests + some renaming

* Fix inheritance issue in the AutoX tests

* Add tests for TF Flaubert and TF XLM Roberta

* Add tests for TF Flaubert and TF XLM Roberta

* Remove unused piece of code in the TF trainer

* bugfix and remove unused code

* Bugfix for TF 2.2

* Apply Style

* Split TFSequenceClassificationAndMultipleChoiceLoss into two classes with their respective names

* Apply style

* Mirror the PT Trainer in the TF one: fp16, optimizers and tb_writer as class parameter and better dataset handling

* Fix TF optimizations tests and apply style

* Remove useless parameter

* Bugfix and apply style

* Fix TF Trainer prediction

* Now the TF models return the loss like their PyTorch counterparts

* Apply Style

* Ignore some tests output

* Take into account the SQuAD cls_index, p_mask and is_impossible parameters for the QuestionAnswering task models.

* Fix names for SQuAD data

* Apply Style

* Fix conflicts with 2.11 release

* Fix conflicts with 2.11

* Fix wrong name

* Add better documentation on the new create_optimizer function

* Fix isort

* logging_dir: use same default as PyTorch
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
parent ccd26c28
...@@ -3,12 +3,12 @@ ...@@ -3,12 +3,12 @@
import logging import logging
import math import math
import os import os
from typing import Callable, Dict, Optional from typing import Callable, Dict, Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from .modeling_tf_utils import TFPreTrainedModel, shape_list from .modeling_tf_utils import TFPreTrainedModel
from .optimization_tf import GradientAccumulator, create_optimizer from .optimization_tf import GradientAccumulator, create_optimizer
from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput
from .training_args_tf import TFTrainingArguments from .training_args_tf import TFTrainingArguments
...@@ -20,13 +20,14 @@ logger = logging.getLogger(__name__) ...@@ -20,13 +20,14 @@ logger = logging.getLogger(__name__)
class TFTrainer: class TFTrainer:
model: TFPreTrainedModel model: TFPreTrainedModel
args: TFTrainingArguments args: TFTrainingArguments
# something similar to a PT Dataset.
# This is just temporary before to have
# a framework-agnostic approach for datasets.
train_dataset: Optional[tf.data.Dataset] train_dataset: Optional[tf.data.Dataset]
eval_dataset: Optional[tf.data.Dataset] eval_dataset: Optional[tf.data.Dataset]
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
prediction_loss_only: bool prediction_loss_only: bool
tb_writer: Optional[tf.summary.SummaryWriter] = None
optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None
global_step: Optional[int] = None
epoch: Optional[float] = None
def __init__( def __init__(
self, self,
...@@ -36,6 +37,8 @@ class TFTrainer: ...@@ -36,6 +37,8 @@ class TFTrainer:
eval_dataset: Optional[tf.data.Dataset] = None, eval_dataset: Optional[tf.data.Dataset] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
prediction_loss_only=False, prediction_loss_only=False,
tb_writer: Optional[tf.summary.SummaryWriter] = None,
optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None,
): ):
self.model = model self.model = model
self.args = args self.args = args
...@@ -43,120 +46,73 @@ class TFTrainer: ...@@ -43,120 +46,73 @@ class TFTrainer:
self.eval_dataset = eval_dataset self.eval_dataset = eval_dataset
self.compute_metrics = compute_metrics self.compute_metrics = compute_metrics
self.prediction_loss_only = prediction_loss_only self.prediction_loss_only = prediction_loss_only
self.optimizers = optimizers
self.gradient_accumulator = GradientAccumulator() self.gradient_accumulator = GradientAccumulator()
self._setup_training() if tb_writer is not None:
self.tb_writer = tb_writer
else:
self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir)
def _setup_training(self) -> None: def get_train_tfdataset(self) -> tf.data.Dataset:
""" if self.train_dataset is None:
Setup the different steps to train a model: raise ValueError("Trainer: training requires a train_dataset.")
- check if all the data are given
- create the proper strategy
- create the features
- prepare the model settings
"""
self._prepare_dataset()
with self.args.strategy.scope(): self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
self._create_optimizer()
_ = self.optimizer.iterations
self._set_loss_and_metric()
self._create_checkpoint_manager()
self._create_summary_writer()
def _set_loss_and_metric(self) -> None: if self.args.max_steps > 0:
""" self.train_steps = self.args.max_steps
Create the training loss and metric with their name. Allowed names are those listed else:
in the Tensorflow documentation and those contained in the transformers library. self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size)
"""
try:
self.loss = tf.keras.losses.get(
{
"class_name": self.args.loss_name,
"config": {"from_logits": True, "reduction": tf.keras.losses.Reduction.NONE},
}
)
except TypeError:
self.loss = tf.keras.losses.get(
{"class_name": self.args.loss_name, "config": {"reduction": tf.keras.losses.Reduction.NONE}}
)
def _create_summary_writer(self) -> None:
"""
Create a summary writer to be able to read the logs in Tensorboard.
"""
self.writer = tf.summary.create_file_writer(self.args.logging_dir)
def _prepare_dataset(self) -> None: ds = (
""" self.train_dataset.cache()
Prepare the training, validation and test data. .shuffle(self.num_train_examples)
""" .batch(self.args.train_batch_size)
if self.train_dataset is not None: .prefetch(tf.data.experimental.AUTOTUNE)
self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy() )
if self.args.max_steps > 0: if self.args.max_steps > 0:
self.train_steps = self.args.max_steps self.train_dataset = self.train_dataset.repeat(-1)
else:
self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size)
self.train_dataset = ( return self.args.strategy.experimental_distribute_dataset(ds)
self.train_dataset.cache()
.shuffle(self.num_train_examples)
.batch(self.args.train_batch_size)
.prefetch(tf.data.experimental.AUTOTUNE)
)
if self.args.max_steps > 0: def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
self.train_dataset = self.train_dataset.repeat(-1) if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
self.train_dataset = self.args.strategy.experimental_distribute_dataset(self.train_dataset) eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
else: ds = eval_dataset.cache().batch(self.args.eval_batch_size).prefetch(tf.data.experimental.AUTOTUNE)
self.train_steps = 0
if self.eval_dataset is not None: return self.args.strategy.experimental_distribute_dataset(ds)
self.eval_dataset = (
self.eval_dataset.batch(self.args.eval_batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
)
self.eval_dataset = self.args.strategy.experimental_distribute_dataset(self.eval_dataset)
def _create_optimizer(self) -> None: def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
""" ds = test_dataset.batch(self.args.eval_batch_size)
Create the training optimizer with its name. Allowed names are those listed
in the Tensorflow documentation and those contained in the transformers library.
"""
if self.args.optimizer_name == "adamw":
self.optimizer = create_optimizer(
self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr
)
else:
try:
self.optimizer = tf.keras.optimizers.get(
{
"class_name": self.args.optimizer_name,
"config": {"learning_rate": self.args.learning_rate, "epsilon": self.args.adam_epsilon},
}
)
except TypeError:
# This is for the case where the optimizer is not Adam-like such as SGD
self.optimizer = tf.keras.optimizers.get(
{"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}}
)
logger.info("Created an/a {} optimizer".format(self.args.optimizer_name))
def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None: return self.args.strategy.experimental_distribute_dataset(ds)
"""
Create a checkpoint manager in order to be able to make the training def get_optimizers(
fault-tolerant. self,
Args: ) -> Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]:
max_to_keep: the maximum number of checkpoints to keep in the checkpoint path.
load_model: if we want to start the training from the latest checkpoint.
""" """
ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model) Setup the optimizer and the learning rate scheduler.
self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep) We provide a reasonable default that works well.
If you want to use something else, you can pass a tuple in the Trainer's init,
or override this method in a subclass.
"""
if self.optimizers is not None:
return self.optimizers
optimizer, scheduler = create_optimizer(
self.args.learning_rate,
self.train_steps,
self.args.warmup_steps,
adam_epsilon=self.args.adam_epsilon,
weight_decay_rate=self.args.weight_decay,
)
if load_model: return optimizer, scheduler
ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
@tf.function @tf.function
def _evaluate_steps(self, per_replica_features, per_replica_labels): def _evaluate_steps(self, per_replica_features, per_replica_labels):
...@@ -182,6 +138,14 @@ class TFTrainer: ...@@ -182,6 +138,14 @@ class TFTrainer:
def _prediction_loop( def _prediction_loop(
self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput: ) -> PredictionOutput:
"""
Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
Works both with or without labels.
"""
prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
logger.info("***** Running %s *****", description) logger.info("***** Running %s *****", description)
logger.info(" Batch size = %d", self.args.eval_batch_size) logger.info(" Batch size = %d", self.args.eval_batch_size)
...@@ -196,6 +160,12 @@ class TFTrainer: ...@@ -196,6 +160,12 @@ class TFTrainer:
loss = tf.reduce_mean(loss) loss = tf.reduce_mean(loss)
if not prediction_loss_only: if not prediction_loss_only:
if isinstance(logits, tuple):
logits = logits[0]
if isinstance(labels, tuple):
labels = labels[0]
if self.args.n_gpu > 1: if self.args.n_gpu > 1:
for val in logits.values: for val in logits.values:
if preds is None: if preds is None:
...@@ -240,10 +210,9 @@ class TFTrainer: ...@@ -240,10 +210,9 @@ class TFTrainer:
""" """
Prediction/evaluation loop, shared by `evaluate()` and `predict()`. Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
""" """
if eval_dataset is None: eval_ds = self.get_eval_tfdataset(eval_dataset)
eval_dataset = self.eval_dataset
output = self._prediction_loop(eval_dataset, description="Evaluation") output = self._prediction_loop(eval_ds, description="Evaluation")
return output.metrics return output.metrics
...@@ -251,12 +220,25 @@ class TFTrainer: ...@@ -251,12 +220,25 @@ class TFTrainer:
""" """
Train method to train the model. Train method to train the model.
""" """
train_ds = self.get_train_tfdataset()
if self.args.debug: if self.args.debug:
tf.summary.trace_on(graph=True, profiler=True) tf.summary.trace_on(graph=True, profiler=True)
self.gradient_accumulator.reset() self.gradient_accumulator.reset()
iterations = self.optimizer.iterations with self.args.strategy.scope():
optimizer, lr_scheduler = self.get_optimizers()
iterations = optimizer.iterations
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model)
self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=5)
if self.model.ckpt_manager.latest_checkpoint:
logger.info(
"Checkpoint file %s found and restoring from checkpoint", self.model.ckpt_manager.latest_checkpoint
)
ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
if iterations.numpy() > 0: if iterations.numpy() > 0:
logger.info("Start the training from the last checkpoint") logger.info("Start the training from the last checkpoint")
...@@ -268,21 +250,30 @@ class TFTrainer: ...@@ -268,21 +250,30 @@ class TFTrainer:
epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs
if self.args.fp16:
policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
tf.keras.mixed_precision.experimental.set_policy(policy)
with self.tb_writer.as_default():
tf.summary.text("args", self.args.to_json_string())
self.tb_writer.flush()
logger.info("***** Running training *****") logger.info("***** Running training *****")
logger.info(" Num examples = %d", self.num_train_examples) logger.info(" Num examples = %d", self.num_train_examples)
logger.info(" Num Epochs = %d", epochs) logger.info(" Num Epochs = %d", epochs)
logger.info(" Total optimization steps = %d", self.train_steps) logger.info(" Total optimization steps = %d", self.train_steps)
for epoch in range(start_epoch, int(epochs + 1)): for epoch in range(start_epoch, int(epochs + 1)):
for training_loss in self._training_steps(): for training_loss in self._training_steps(train_ds, optimizer):
step = iterations.numpy() step = iterations.numpy()
if self.args.debug: if self.args.debug:
with self.writer.as_default(): with self.tb_writer.as_default():
tf.summary.scalar("loss", training_loss, step=step) tf.summary.scalar("loss", training_loss, step=step)
if step == 1 and self.args.debug: if step == 1 and self.args.debug:
with self.writer.as_default(): with self.tb_writer.as_default():
tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir) tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir)
if self.args.evaluate_during_training and step % self.args.eval_steps == 0: if self.args.evaluate_during_training and step % self.args.eval_steps == 0:
...@@ -293,17 +284,16 @@ class TFTrainer: ...@@ -293,17 +284,16 @@ class TFTrainer:
eval_key = "eval_{}".format(key) eval_key = "eval_{}".format(key)
logs[eval_key] = value logs[eval_key] = value
if callable(self.optimizer.learning_rate): logs["learning_rate"] = lr_scheduler(step).numpy()
logs["learning_rate"] = self.optimizer.learning_rate(step).numpy()
else:
logs["learning_rate"] = self.optimizer.learning_rate.numpy()
logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs)) logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs))
with self.writer.as_default(): with self.tb_writer.as_default():
for k, v in logs.items(): for k, v in logs.items():
tf.summary.scalar(k, v, step=step) tf.summary.scalar(k, v, step=step)
self.tb_writer.flush()
if step % self.args.logging_steps == 0: if step % self.args.logging_steps == 0:
logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy())) logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy()))
...@@ -314,21 +304,21 @@ class TFTrainer: ...@@ -314,21 +304,21 @@ class TFTrainer:
if step % self.train_steps == 0: if step % self.train_steps == 0:
break break
def _training_steps(self): def _training_steps(self, ds, optimizer):
""" """
Returns a generator over training steps (i.e. parameters update). Returns a generator over training steps (i.e. parameters update).
""" """
for i, loss in enumerate(self._accumulate_next_gradients()): for i, loss in enumerate(self._accumulate_next_gradients(ds)):
if i % self.args.gradient_accumulation_steps == 0: if i % self.args.gradient_accumulation_steps == 0:
self._apply_gradients() self._apply_gradients(optimizer)
yield loss yield loss
@tf.function @tf.function
def _apply_gradients(self): def _apply_gradients(self, optimizer):
"""Applies the gradients (cross-replica).""" """Applies the gradients (cross-replica)."""
self.args.strategy.experimental_run_v2(self._step) self.args.strategy.experimental_run_v2(self._step, args=(optimizer,))
def _step(self): def _step(self, optimizer):
"""Applies gradients and resets accumulation.""" """Applies gradients and resets accumulation."""
gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync
gradients = [ gradients = [
...@@ -336,12 +326,12 @@ class TFTrainer: ...@@ -336,12 +326,12 @@ class TFTrainer:
] ]
gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients] gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients]
self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
self.gradient_accumulator.reset() self.gradient_accumulator.reset()
def _accumulate_next_gradients(self): def _accumulate_next_gradients(self, ds):
"""Accumulates the gradients from the next element in dataset.""" """Accumulates the gradients from the next element in dataset."""
iterator = iter(self.train_dataset) iterator = iter(ds)
@tf.function @tf.function
def _accumulate_next(): def _accumulate_next():
...@@ -388,23 +378,10 @@ class TFTrainer: ...@@ -388,23 +378,10 @@ class TFTrainer:
labels: the batched labels. labels: the batched labels.
training: run the model in training mode or not training: run the model in training mode or not
""" """
if self.args.mode == "text-classification" or self.args.mode == "token-classification": if isinstance(labels, (dict)):
logits = self.model(features, training=training)[0] loss, logits = self.model(features, training=training, **labels)[:2]
else:
logits = self.model(features, training=training)
if self.args.mode == "token-classification":
active_loss = tf.reshape(labels, (-1,)) != -1
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
loss = self.loss(labels, reduced_logits)
elif self.args.mode == "question-answering":
start_loss = self.loss(labels["start_position"], logits[0])
end_loss = self.loss(labels["end_position"], logits[1])
loss = (start_loss + end_loss) / 2.0
else: else:
loss = self.loss(labels, logits) loss, logits = self.model(features, labels=labels, training=training)[:2]
loss += sum(self.model.losses) * (1.0 / self.args.n_gpu) loss += sum(self.model.losses) * (1.0 / self.args.n_gpu)
return loss, logits return loss, logits
...@@ -418,19 +395,24 @@ class TFTrainer: ...@@ -418,19 +395,24 @@ class TFTrainer:
test_dataset: something similar to a PT Dataset. This is just test_dataset: something similar to a PT Dataset. This is just
temporary before to have a framework-agnostic approach for datasets. temporary before to have a framework-agnostic approach for datasets.
""" """
test_dataset = test_dataset.batch(self.args.eval_batch_size) test_ds = self.get_test_tfdataset(test_dataset)
test_dataset = self.args.strategy.experimental_distribute_dataset(test_dataset)
return self._prediction_loop(test_dataset, description="Prediction") return self._prediction_loop(test_ds, description="Prediction")
def save_model(self) -> None: def save_model(self, output_dir: Optional[str] = None):
""" """
Save the pretrained model and create a Tensorflow saved model. Save the pretrained model and create a Tensorflow saved model.
""" """
logger.info("Saving model in {}".format(self.args.output_dir)) output_dir = output_dir if output_dir is not None else self.args.output_dir
logger.info("Saving model in {}".format(output_dir))
path = os.path.join(self.args.output_dir, "saved_model") path = os.path.join(self.args.output_dir, "saved_model")
logger.info("Saving model in {}".format(path)) logger.info("Saving model in {}".format(path))
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
if not isinstance(self.model, TFPreTrainedModel):
raise ValueError("Trainer.model appears to not be a PreTrainedModel")
self.model.save_pretrained(self.args.output_dir) self.model.save_pretrained(self.args.output_dir)
import dataclasses import dataclasses
import json import json
import logging import logging
import os
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Tuple from typing import Any, Dict, Optional, Tuple
...@@ -27,6 +28,17 @@ def is_tpu_available(): ...@@ -27,6 +28,17 @@ def is_tpu_available():
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def default_logdir() -> str:
    """
    Build the default Tensorboard log directory (same default as PyTorch).

    Returns:
        A relative path of the form ``runs/<timestamp>_<hostname>``, where the
        timestamp is the current local time formatted with ``%b%d_%H-%M-%S``.
    """
    import socket
    from datetime import datetime

    # One directory per launch: the timestamp separates successive runs,
    # the hostname separates runs started on different machines.
    timestamp = datetime.now().strftime("%b%d_%H-%M-%S")
    run_name = "{}_{}".format(timestamp, socket.gethostname())
    return os.path.join("runs", run_name)
@dataclass @dataclass
class TrainingArguments: class TrainingArguments:
""" """
...@@ -97,7 +109,7 @@ class TrainingArguments: ...@@ -97,7 +109,7 @@ class TrainingArguments:
) )
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."}) logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"}) logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
......
...@@ -14,28 +14,9 @@ if is_tf_available(): ...@@ -14,28 +14,9 @@ if is_tf_available():
@dataclass @dataclass
class TFTrainingArguments(TrainingArguments): class TFTrainingArguments(TrainingArguments):
optimizer_name: str = field(
default="adam",
metadata={
"help": 'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"'
},
)
mode: str = field(
default="text-classification",
metadata={"help": 'Type of task, one of "text-classification", "token-classification", "question-answering"'},
)
loss_name: str = field(
default="SparseCategoricalCrossentropy",
metadata={
"help": "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses"
},
)
tpu_name: str = field( tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"}, default=None, metadata={"help": "Name of TPU"},
) )
end_lr: float = field(
default=0, metadata={"help": "End learning rate for optimizer"},
)
eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."}) eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."})
debug: bool = field( debug: bool = field(
default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"} default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"}
......
...@@ -30,7 +30,7 @@ if is_tf_available(): ...@@ -30,7 +30,7 @@ if is_tf_available():
import tensorflow as tf import tensorflow as tf
import numpy as np import numpy as np
from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding, TFSharedEmbeddings
if _tf_gpu_memory_limit is not None: if _tf_gpu_memory_limit is not None:
gpus = tf.config.list_physical_devices("GPU") gpus = tf.config.list_physical_devices("GPU")
...@@ -107,26 +107,45 @@ class TFModelTesterMixin: ...@@ -107,26 +107,45 @@ class TFModelTesterMixin:
and getattr(module_member, "_keras_serializable", False) and getattr(module_member, "_keras_serializable", False)
) )
for main_layer_class in tf_main_layer_classes: for main_layer_class in tf_main_layer_classes:
main_layer = main_layer_class(config) # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
if "T5" in main_layer_class.__name__:
# Take the same values than in TFT5ModelTester for this shared layer
shared = TFSharedEmbeddings(99, 32, name="shared")
main_layer = main_layer_class(config, embed_tokens=shared)
else:
main_layer = main_layer_class(config)
symbolic_inputs = { symbolic_inputs = {
name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
} }
model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
outputs = model(inputs_dict) outputs = model(inputs_dict)
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "keras_model.h5") filepath = os.path.join(tmpdirname, "keras_model.h5")
model.save(filepath) model.save(filepath)
model = tf.keras.models.load_model( if "T5" in main_layer_class.__name__:
filepath, custom_objects={main_layer_class.__name__: main_layer_class} model = tf.keras.models.load_model(
) filepath,
custom_objects={
main_layer_class.__name__: main_layer_class,
"TFSharedEmbeddings": TFSharedEmbeddings,
},
)
else:
model = tf.keras.models.load_model(
filepath, custom_objects={main_layer_class.__name__: main_layer_class}
)
assert isinstance(model, tf.keras.Model) assert isinstance(model, tf.keras.Model)
after_outputs = model(inputs_dict) after_outputs = model(inputs_dict)
self.assert_outputs_same(after_outputs, outputs) self.assert_outputs_same(after_outputs, outputs)
def assert_outputs_same(self, after_outputs, outputs): def assert_outputs_same(self, after_outputs, outputs):
# Make sure we don't have nans # Make sure we don't have nans
out_1 = after_outputs[0].numpy() if isinstance(after_outputs, tf.Tensor):
out_1 = after_outputs.numpy()
else:
out_1 = after_outputs[0].numpy()
out_2 = outputs[0].numpy() out_2 = outputs[0].numpy()
self.assertEqual(out_1.shape, out_2.shape) self.assertEqual(out_1.shape, out_2.shape)
out_1 = out_1[~np.isnan(out_1)] out_1 = out_1[~np.isnan(out_1)]
...@@ -269,7 +288,6 @@ class TFModelTesterMixin: ...@@ -269,7 +288,6 @@ class TFModelTesterMixin:
inputs_keywords = copy.deepcopy(inputs_dict) inputs_keywords = copy.deepcopy(inputs_dict)
input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,) input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,)
outputs_keywords = model(input_ids, **inputs_keywords) outputs_keywords = model(input_ids, **inputs_keywords)
output_dict = outputs_dict[0].numpy() output_dict = outputs_dict[0].numpy()
output_keywords = outputs_keywords[0].numpy() output_keywords = outputs_keywords[0].numpy()
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_tf_available
from .utils import require_tf, slow
if is_tf_available():
import tensorflow as tf
import numpy as np
from transformers import TFFlaubertModel
@require_tf
class TFFlaubertModelIntegrationTest(unittest.TestCase):
    """Slow integration test for the pretrained TF Flaubert base model."""

    @slow
    def test_output_embeds_base_model(self):
        """Check the shape and a small slice of the model's first output on a fixed input."""
        pretrained = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased")

        # Token ids for the sentence "J'aime flaubert !"
        token_ids = [[0, 158, 735, 2592, 1424, 6727, 82, 1]]
        input_ids = tf.convert_to_tensor(token_ids, dtype=tf.int32)

        first_output = pretrained(input_ids)[0]
        self.assertEqual(first_output.shape, tf.TensorShape((1, 8, 512)))

        # Reference values for the [:, :3, :3] corner of the first output;
        # compared with an absolute tolerance of 1e-4.
        reference = tf.convert_to_tensor(
            [
                [
                    [-1.8768773, -1.566555, 0.27072418],
                    [-1.6920038, -0.5873505, 1.9329599],
                    [-2.9563985, -1.6993835, 1.7972052],
                ]
            ],
            dtype=tf.float32,
        )
        observed = first_output[:, :3, :3].numpy()
        self.assertTrue(np.allclose(observed, reference.numpy(), atol=1e-4))
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_tf_available
from .utils import require_tf, slow
if is_tf_available():
import tensorflow as tf
import numpy as np
from transformers import TFXLMRobertaModel
# NOTE(review): the class name says "Flaubert" but the test loads TFXLMRobertaModel;
# this looks like a copy-paste leftover. The name is kept so existing test selectors
# keep working -- consider renaming to TFXLMRobertaModelIntegrationTest.
@require_tf
class TFFlaubertModelIntegrationTest(unittest.TestCase):
    """Slow integration test for the pretrained TF XLM-RoBERTa base model."""

    @slow
    def test_output_embeds_base_model(self):
        """Check the shape and a small slice of the model's first output on a fixed input."""
        pretrained = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")

        # Token ids for the sentence "My dog is cute", with every position attended to.
        token_ids = [[0, 2646, 10269, 83, 99942, 2]]
        mask = [[1, 1, 1, 1, 1, 1]]
        features = {
            "input_ids": tf.convert_to_tensor(token_ids, dtype=tf.int32),
            "attention_mask": tf.convert_to_tensor(mask, dtype=tf.int32),
        }

        first_output = pretrained(features)[0]
        self.assertEqual(first_output.shape, tf.TensorShape((1, 6, 768)))

        # Reference values for the [:, :3, :3] corner of the first output;
        # compared with an absolute tolerance of 1e-4.
        reference = tf.convert_to_tensor(
            [
                [
                    [0.0681762, 0.10894451, 0.06772504],
                    [-0.06423668, 0.02366615, 0.04329344],
                    [-0.06057295, 0.09974135, -0.00070584],
                ]
            ],
            dtype=tf.float32,
        )
        observed = first_output[:, :3, :3].numpy()
        self.assertTrue(np.allclose(observed, reference.numpy(), atol=1e-4))
...@@ -47,7 +47,7 @@ class OptimizationFTest(unittest.TestCase): ...@@ -47,7 +47,7 @@ class OptimizationFTest(unittest.TestCase):
with strategy.scope(): with strategy.scope():
accumulator = GradientAccumulator() accumulator = GradientAccumulator()
variable = tf.Variable([4.0, 3.0]) variable = tf.Variable([4.0, 3.0])
optimizer = create_optimizer(5e-5, 10, 5) optimizer, _ = create_optimizer(5e-5, 10, 5)
gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False) gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
def accumulate_on_replica(gradient): def accumulate_on_replica(gradient):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment