Unverified commit 6eb51450 authored by Matt, committed by GitHub

TF Examples Rewrite (#18451)



* Finished QA example

* Dodge a merge conflict

* Update text classification and LM examples

* Update NER example

* New Keras metrics WIP, fix NER example

* Update NER example

* Update MC, summarization and translation examples

* Add XLA warnings when shapes are variable

* Make sure batch_size is consistently scaled by num_replicas

* Add PushToHubCallback to all models

* Add docs links for KerasMetricCallback

* Add docs links for prepare_tf_dataset and jit_compile

* Correct inferred model names

* Don't assume the dataset has 'lang'

* Don't assume the dataset has 'lang'

* Write metrics in text classification

* Add 'framework' to TrainingArguments and TFTrainingArguments

* Export metrics in all examples and add tests

* Fix training args for Flax

* Update command line args for translation test

* make fixup

* Fix accidentally running other tests in fp16

* Remove do_train/do_eval from run_clm.py

* Remove do_train/do_eval from run_mlm.py

* Add tensorflow tests to circleci

* Fix circleci

* Update examples/tensorflow/language-modeling/run_mlm.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/test_tensorflow_examples.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/translation/run_translation.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/token-classification/run_ner.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Fix save path for tests

* Fix some model card kwargs

* Explain the magical -1000

* Actually enable tests this time

* Skip text classification test until we fix shape inference

* make fixup
Co-authored-by: default avatarJoao Gante <joaofranciscocardosogante@gmail.com>
parent d7e2d7b4
......@@ -658,6 +658,71 @@ jobs:
- store_artifacts:
path: ~/transformers/reports
run_examples_tensorflow:
working_directory: ~/transformers
docker:
- image: cimg/python:3.7.12
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
PYTEST_TIMEOUT: 120
resource_class: xlarge
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.5-tensorflow_examples-{{ checksum "setup.py" }}
- v0.5-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tensorflow,sentencepiece,testing]
- run: pip install -r examples/tensorflow/_tests_requirements.txt
- save_cache:
key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
- store_artifacts:
path: ~/transformers/test_preparation.txt
- run: |
if [ -f test_list.txt ]; then
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tensorflow_examples_output.txt
fi
- store_artifacts:
path: ~/transformers/tensorflow_examples_output.txt
- store_artifacts:
path: ~/transformers/reports
run_examples_tensorflow_all:
working_directory: ~/transformers
docker:
- image: cimg/python:3.7.12
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
PYTEST_TIMEOUT: 120
resource_class: xlarge
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.5-tensorflow_examples-{{ checksum "setup.py" }}
- v0.5-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tensorflow,sentencepiece,testing]
- run: pip install -r examples/tensorflow/_tests_requirements.txt
- save_cache:
key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: |
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tensorflow_examples_output.txt
- store_artifacts:
path: ~/transformers/tensorflow_examples_output.txt
- store_artifacts:
path: ~/transformers/reports
run_examples_flax:
working_directory: ~/transformers
docker:
......@@ -1000,6 +1065,7 @@ workflows:
- check_code_quality
- check_repository_consistency
- run_examples_torch
- run_examples_tensorflow
- run_examples_flax
- run_tests_custom_tokenizers
- run_tests_torch_and_tf
......@@ -1022,6 +1088,7 @@ workflows:
- main
jobs:
- run_examples_torch_all
- run_examples_tensorflow_all
- run_examples_flax_all
- run_tests_torch_and_tf_all
- run_tests_torch_and_flax_all
......
tensorflow
tensorboard
scikit-learn
seqeval
psutil
sacrebleu >= 1.4.12
git+https://github.com/huggingface/accelerate@main#egg=accelerate
rouge-score
tensorflow_datasets
matplotlib
git-python==1.0.3
faiss-cpu
streamlit
elasticsearch
nltk
pandas
datasets >= 1.13.3
fire
pytest
conllu
sentencepiece != 0.1.92
protobuf
jiwer
librosa
evaluate >= 0.2.0
......@@ -22,6 +22,8 @@ https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own clm task. Pointers for this are left as comments.
import json
# region Imports
import logging
import math
......@@ -46,8 +48,8 @@ from transformers import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
AutoConfig,
AutoTokenizer,
DefaultDataCollator,
HfArgumentParser,
PushToHubCallback,
TFAutoModelForCausalLM,
TFTrainingArguments,
create_optimizer,
......@@ -205,21 +207,6 @@ class DataTrainingArguments:
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
# endregion
# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
# that saves the model with this method after each epoch.
def __init__(self, output_dir, **kwargs):
super().__init__()
self.output_dir = output_dir
def on_epoch_end(self, epoch, logs=None):
self.model.save_pretrained(self.output_dir)
# endregion
......@@ -299,6 +286,7 @@ def main():
raw_datasets = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
)
if "validation" not in raw_datasets.keys():
......@@ -306,12 +294,14 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
)
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
)
else:
......@@ -321,16 +311,39 @@ def main():
data_files["train"] = data_args.train_file
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
extension = (
data_args.train_file.split(".")[-1]
if data_args.train_file is not None
else data_args.validation_file.split(".")[-1]
)
if extension == "txt":
extension = "text"
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
raw_datasets = load_dataset(
extension,
data_files=data_files,
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
**dataset_args,
)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
**dataset_args,
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
**dataset_args,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# endregion
......@@ -446,7 +459,7 @@ def main():
eval_dataset = eval_dataset.select(range(max_eval_samples))
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# endregion
......@@ -465,44 +478,88 @@ def main():
# region TF Dataset preparation
num_replicas = training_args.strategy.num_replicas_in_sync
data_collator = DefaultDataCollator(return_tensors="tf")
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_train_dataset = train_dataset.to_tf_dataset(
# labels are passed as input, as we will use the model's internal loss
columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
shuffle=True,
batch_size=num_replicas * training_args.per_device_train_batch_size,
collate_fn=data_collator,
drop_remainder=True,
).with_options(options)
tf_eval_dataset = eval_dataset.to_tf_dataset(
# labels are passed as input, as we will use the model's internal loss
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
shuffle=False,
batch_size=num_replicas * training_args.per_device_train_batch_size,
collate_fn=data_collator,
batch_size=num_replicas * training_args.per_device_eval_batch_size,
drop_remainder=True,
).with_options(options)
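# For comparison, the equivalent lower-level call would look roughly like the (commented-out, illustrative)
# sketch below - note that the columns must be listed by hand instead of being inferred from the model:
#     tf_train_dataset = train_dataset.to_tf_dataset(
#         columns=["input_ids", "attention_mask", "labels"],  # illustrative column names
#         shuffle=True,
#         batch_size=num_replicas * training_args.per_device_train_batch_size,
#         drop_remainder=True,
#     ).with_options(options)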
# endregion
# region Optimizer and loss
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
# Bias and layernorm weights are automatically excluded from the decay
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
num_warmup_steps=training_args.warmup_steps,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# no user-specified loss = will use the model internal loss
model.compile(optimizer=optimizer)
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-clm"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training and validation
......@@ -512,33 +569,45 @@ def main():
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
# For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints
# to the Hugging Face Hub rather than just pushing the finished model.
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
history = model.fit(
tf_train_dataset,
validation_data=tf_eval_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
callbacks=callbacks,
)
train_loss = history.history["loss"][-1]
try:
train_perplexity = math.exp(history.history["loss"][-1])
train_perplexity = math.exp(train_loss)
except OverflowError:
train_perplexity = math.inf
logger.info(f" Final train loss: {train_loss:.3f}")
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
validation_loss = history.history["val_loss"][-1]
try:
validation_perplexity = math.exp(history.history["val_loss"][-1])
validation_perplexity = math.exp(validation_loss)
except OverflowError:
validation_perplexity = math.inf
logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}")
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
logger.info(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
logger.info(f" Final validation loss: {validation_loss:.3f}")
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
# endregion
if training_args.output_dir is not None:
model.save_pretrained(training_args.output_dir)
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
results_dict = dict()
results_dict["train_loss"] = train_loss
results_dict["train_perplexity"] = train_perplexity
results_dict["eval_loss"] = validation_loss
results_dict["eval_perplexity"] = validation_perplexity
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(results_dict))
# endregion
if training_args.push_to_hub:
# You'll probably want to include some of your own metadata here!
model.push_to_hub()
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
......
......@@ -22,9 +22,7 @@ https://huggingface.co/models?filter=fill-mask
"""
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected
# TODO Duplicate all changes over to the CLM script
import json
import logging
import math
import os
......@@ -50,6 +48,7 @@ from transformers import (
AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser,
PushToHubCallback,
TFAutoModelForMaskedLM,
TFTrainingArguments,
create_optimizer,
......@@ -217,22 +216,6 @@ class DataTrainingArguments:
# endregion
# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
# that saves the model with this method after each epoch.
def __init__(self, output_dir, **kwargs):
super().__init__()
self.output_dir = output_dir
def on_epoch_end(self, epoch, logs=None):
self.model.save_pretrained(self.output_dir)
# endregion
def main():
# region Argument Parsing
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
......@@ -492,7 +475,7 @@ def main():
eval_dataset = eval_dataset.select(range(max_eval_samples))
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# endregion
......@@ -517,40 +500,88 @@ def main():
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_train_dataset = train_dataset.to_tf_dataset(
# labels are passed as input, as we will use the model's internal loss
columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"],
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
shuffle=True,
batch_size=num_replicas * training_args.per_device_train_batch_size,
collate_fn=data_collator,
drop_remainder=True,
).with_options(options)
tf_eval_dataset = eval_dataset.to_tf_dataset(
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
# labels are passed as input, as we will use the model's internal loss
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
shuffle=False,
batch_size=num_replicas * training_args.per_device_train_batch_size,
batch_size=num_replicas * training_args.per_device_eval_batch_size,
collate_fn=data_collator,
drop_remainder=True,
).with_options(options)
# endregion
# region Optimizer and loss
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
# Bias and layernorm weights are automatically excluded from the decay
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
num_warmup_steps=training_args.warmup_steps,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# no user-specified loss = will use the model internal loss
model.compile(optimizer=optimizer)
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-mlm"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training and validation
......@@ -560,33 +591,46 @@ def main():
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
# For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints
# to the Hugging Face Hub rather than just pushing the finished model.
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
history = model.fit(
tf_train_dataset,
validation_data=tf_eval_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
callbacks=callbacks,
)
train_loss = history.history["loss"][-1]
try:
train_perplexity = math.exp(history.history["loss"][-1])
train_perplexity = math.exp(train_loss)
except OverflowError:
train_perplexity = math.inf
try:
validation_perplexity = math.exp(history.history["val_loss"][-1])
except OverflowError:
validation_perplexity = math.inf
logger.warning(f" Final train loss: {history.history['loss'][-1]:.3f}")
logger.warning(f" Final train perplexity: {train_perplexity:.3f}")
logger.warning(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
logger.warning(f" Final validation perplexity: {validation_perplexity:.3f}")
# endregion
logger.info(f" Final train loss: {train_loss:.3f}")
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
validation_loss = history.history["val_loss"][-1]
try:
validation_perplexity = math.exp(validation_loss)
except OverflowError:
validation_perplexity = math.inf
logger.info(f" Final validation loss: {validation_loss:.3f}")
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
if training_args.output_dir is not None:
model.save_pretrained(training_args.output_dir)
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
results_dict = dict()
results_dict["train_loss"] = train_loss
results_dict["train_perplexity"] = train_perplexity
results_dict["eval_loss"] = validation_loss
results_dict["eval_perplexity"] = validation_perplexity
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(results_dict))
# endregion
if training_args.push_to_hub:
# You'll probably want to append some of your own metadata here!
model.push_to_hub()
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
......
......@@ -18,6 +18,7 @@ Fine-tuning the library models for multiple choice.
"""
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
import json
import logging
import os
import sys
......@@ -38,6 +39,7 @@ from transformers import (
AutoTokenizer,
DefaultDataCollator,
HfArgumentParser,
PushToHubCallback,
TFAutoModelForMultipleChoice,
TFTrainingArguments,
create_optimizer,
......@@ -54,16 +56,6 @@ logger = logging.getLogger(__name__)
# region Helper classes and functions
class SavePretrainedCallback(tf.keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
# that saves the model with this method after each epoch.
def __init__(self, output_dir, **kwargs):
super().__init__()
self.output_dir = output_dir
def on_epoch_end(self, epoch, logs=None):
self.model.save_pretrained(self.output_dir)
@dataclass
......@@ -391,7 +383,6 @@ def main():
if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = raw_datasets["train"]
non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
if data_args.max_train_samples is not None:
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
......@@ -407,8 +398,6 @@ def main():
if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = raw_datasets["validation"]
if not training_args.do_train:
non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
if data_args.max_eval_samples is not None:
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
eval_dataset = eval_dataset.select(range(max_eval_samples))
......@@ -444,79 +433,120 @@ def main():
num_replicas = training_args.strategy.num_replicas_in_sync
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
if training_args.do_train:
total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
else:
optimizer = "adam" # Just put anything in here, since we're not using it anyway
model.compile(
optimizer=optimizer,
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
)
optimizer = None
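# No user-specified loss below: as in the other examples, the model's internal loss is used,
# with accuracy tracked as a Keras metric.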
model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla)
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice"}
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training
eval_metrics = None
if training_args.do_train:
dataset_exclude_cols = set(non_label_columns + ["label"])
tf_train_dataset = train_dataset.to_tf_dataset(
columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols],
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
shuffle=True,
batch_size=total_train_batch_size,
collate_fn=data_collator,
drop_remainder=True,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in train_dataset.column_names else None,
)
).with_options(dataset_options)
if training_args.do_eval:
validation_data = eval_dataset.to_tf_dataset(
columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
validation_data = model.prepare_tf_dataset(
eval_dataset,
shuffle=False,
batch_size=total_eval_batch_size,
collate_fn=data_collator,
drop_remainder=True,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in eval_dataset.column_names else None,
)
).with_options(dataset_options)
else:
validation_data = None
model.fit(
history = model.fit(
tf_train_dataset,
validation_data=validation_data,
epochs=int(training_args.num_train_epochs),
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
callbacks=callbacks,
)
eval_metrics = {key: val[-1] for key, val in history.history.items()}
# endregion
# region Evaluation
if training_args.do_eval and not training_args.do_train:
dataset_exclude_cols = set(non_label_columns + ["label"])
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# Do a standalone evaluation pass
tf_eval_dataset = eval_dataset.to_tf_dataset(
columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
shuffle=False,
batch_size=total_eval_batch_size,
collate_fn=data_collator,
drop_remainder=True,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in eval_dataset.column_names else None,
)
model.evaluate(tf_eval_dataset)
).with_options(dataset_options)
eval_results = model.evaluate(tf_eval_dataset)
eval_metrics = {"val_loss": eval_results[0], "val_accuracy": eval_results[1]}
# endregion
if eval_metrics is not None and training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metrics))
# region Push to hub
if training_args.push_to_hub:
model.push_to_hub(
finetuned_from=model_args.model_name_or_path,
tasks="multiple-choice",
dataset_tags="swag",
dataset_args="regular",
dataset="SWAG",
language="en",
)
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
# endregion
......
......@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
import json
import logging
import os
import sys
......@@ -33,13 +34,13 @@ import transformers
from transformers import (
AutoConfig,
AutoTokenizer,
DataCollatorWithPadding,
DefaultDataCollator,
EvalPrediction,
HfArgumentParser,
PreTrainedTokenizerFast,
PushToHubCallback,
TFAutoModelForQuestionAnswering,
TFTrainingArguments,
create_optimizer,
set_seed,
)
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
......@@ -609,7 +610,12 @@ def main():
# endregion
with training_args.strategy.scope():
# region Load model
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
num_replicas = training_args.strategy.num_replicas_in_sync
# region Load model and prepare datasets
if checkpoint is None:
model_path = model_args.model_name_or_path
else:
......@@ -621,71 +627,163 @@ def main():
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
optimizer = tf.keras.optimizers.Adam(
learning_rate=training_args.learning_rate,
beta_1=training_args.adam_beta1,
beta_2=training_args.adam_beta2,
epsilon=training_args.adam_epsilon,
clipnorm=training_args.max_grad_norm,
)
if training_args.do_train:
# no user-specified loss = will use the model internal loss
model.compile(optimizer=optimizer)
# endregion
training_dataset = model.prepare_tf_dataset(
processed_datasets["train"],
shuffle=True,
batch_size=training_args.per_device_train_batch_size * num_replicas,
tokenizer=tokenizer,
)
training_dataset = training_dataset.with_options(dataset_options)
num_train_steps = len(training_dataset) * training_args.num_train_epochs
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
optimizer, schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=len(training_dataset) * training_args.num_train_epochs,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# no user-specified loss = will use the model internal loss
model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])
# region Training
if padding:
data_collator = DefaultDataCollator(return_tensors="tf")
else:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
tensor_keys = ["attention_mask", "input_ids"]
label_keys = ["start_positions", "end_positions"]
model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
training_dataset = None
if training_args.do_train:
# Make a tf.data.Dataset for this
training_dataset = processed_datasets["train"].to_tf_dataset(
# labels are passed as input, as we will use the model's internal loss
columns=tensor_keys + label_keys,
shuffle=True,
batch_size=training_args.per_device_train_batch_size,
collate_fn=data_collator,
drop_remainder=True,
if training_args.do_eval:
eval_dataset = model.prepare_tf_dataset(
processed_datasets["validation"],
shuffle=False,
batch_size=training_args.per_device_train_batch_size * num_replicas,
tokenizer=tokenizer,
)
eval_dataset = eval_dataset.with_options(dataset_options)
else:
eval_dataset = None
if training_args.do_predict:
predict_dataset = model.prepare_tf_dataset(
processed_datasets["test"],
shuffle=False,
batch_size=training_args.per_device_eval_batch_size * num_replicas,
tokenizer=tokenizer,
)
model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
predict_dataset = predict_dataset.with_options(dataset_options)
else:
predict_dataset = None
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Evaluation
# region Training and Evaluation
if training_args.do_train:
# Note that the validation and test datasets have been processed in a different way to the
# training datasets in this example, and so they don't have the same label structure.
# As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
# after training.
model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
if training_args.do_eval:
logger.info("*** Evaluation ***")
eval_inputs = {
"input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
"attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
}
eval_predictions = model.predict(eval_inputs)
# In this example, we compute advanced metrics at the end of training, but
# if you'd like to compute metrics every epoch that are too complex to be written as
# standard Keras metrics, you can use our KerasMetricCallback. See
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
eval_predictions = model.predict(eval_dataset)
if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
# padding positions are correctly masked.
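# As a quick illustration: softmax([3.0, 0.0]) is roughly [0.95, 0.05], so a position padded with 0 would still
# receive about 5% of the probability mass, whereas softmax([3.0, -1000.0]) gives the padded position effectively 0.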
eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy()
eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy()
else:
eval_start_logits = eval_predictions.start_logits
eval_end_logits = eval_predictions.end_logits
post_processed_eval = post_processing_function(
datasets["validation"],
processed_datasets["validation"],
(eval_predictions.start_logits, eval_predictions.end_logits),
(eval_start_logits, eval_end_logits),
)
metrics = compute_metrics(post_processed_eval)
logging.info("Evaluation metrics:")
for metric, value in metrics.items():
logging.info(f"{metric}: {value:.3f}")
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(metrics))
# endregion
# region Prediction
if training_args.do_predict:
logger.info("*** Predict ***")
predict_inputs = {
"input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
"attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
}
test_predictions = model.predict(predict_inputs)
test_predictions = model.predict(predict_dataset)
if isinstance(test_predictions.start_logits, tf.RaggedTensor):
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
# padding positions are correctly masked.
test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy()
test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy()
else:
test_start_logits = test_predictions.start_logits
test_end_logits = test_predictions.end_logits
post_processed_test = post_processing_function(
datasets["test"],
processed_datasets["test"],
(test_predictions.start_logits, test_predictions.end_logits),
(test_start_logits, test_end_logits),
)
metrics = compute_metrics(post_processed_test)
......@@ -694,8 +792,9 @@ def main():
logging.info(f"{metric}: {value:.3f}")
# endregion
if training_args.push_to_hub:
model.push_to_hub()
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
......
......@@ -18,11 +18,11 @@ Fine-tuning the library models for summarization.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from functools import partial
from typing import Optional
import datasets
......@@ -30,7 +30,6 @@ import nltk # Here to have a nice missing dependency error message early on
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import transformers
......@@ -38,7 +37,10 @@ from filelock import FileLock
from transformers import (
AutoConfig,
AutoTokenizer,
DataCollatorForSeq2Seq,
HfArgumentParser,
KerasMetricCallback,
PushToHubCallback,
TFAutoModelForSeq2SeqLM,
TFTrainingArguments,
create_optimizer,
......@@ -253,7 +255,6 @@ class DataTrainingArguments:
# endregion
# region Dataset name mappings
summarization_name_mapping = {
"amazon_reviews_multi": ("review_body", "review_title"),
......@@ -272,71 +273,6 @@ summarization_name_mapping = {
# endregion
# region Data generator
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
if shuffle:
sample_ordering = np.random.permutation(len(dataset))
else:
sample_ordering = np.arange(len(dataset))
for sample_idx in sample_ordering:
example = dataset[int(sample_idx)]
# Handle dicts with proper padding and conversion to tensor.
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()}
if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
decoder_input_ids = model.prepare_decoder_input_ids_from_labels(
labels=tf.expand_dims(example["labels"], 0)
)
example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0)
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
return
# endregion
# region Helper functions
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
if dataset is None:
return None
train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)
train_signature = {
feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
for feature in dataset.features
if feature != "special_tokens_mask"
}
if (
model is not None
and "decoder_input_ids" not in train_signature
and hasattr(model, "prepare_decoder_input_ids_from_labels")
):
train_signature["decoder_input_ids"] = train_signature["labels"]
# This may need to be changed depending on your particular model or tokenizer!
padding_values = {
key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32)
for key in train_signature.keys()
}
padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)
train_signature["labels"] = train_signature["input_ids"]
train_signature = (train_signature, train_signature["labels"])
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_dataset = (
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
.with_options(options)
.padded_batch(
batch_size=total_batch_size,
drop_remainder=True,
padding_values=(padding_values, np.array(-100, dtype=np.int32)),
)
.repeat(int(num_epochs))
)
return tf_dataset
# endregion
def main():
# region Argument parsing
# See all possible arguments in src/transformers/training_args.py
......@@ -587,59 +523,148 @@ def main():
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
label_pad_token_id=label_pad_token_id,
pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation
return_tensors="tf",
)
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
num_replicas = training_args.strategy.num_replicas_in_sync
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
tf_train_dataset = dataset_to_tf(
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
model,
tokenizer,
total_batch_size=total_train_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=data_collator,
batch_size=total_train_batch_size,
shuffle=True,
)
tf_eval_dataset = dataset_to_tf(
).with_options(dataset_options)
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
model,
tokenizer,
total_eval_batch_size,
num_epochs=1,
collate_fn=data_collator,
batch_size=total_eval_batch_size,
shuffle=False,
)
).with_options(dataset_options)
# endregion
# region Optimizer, loss and LR scheduling
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
)
def masked_sparse_categorical_crossentropy(y_true, y_pred):
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
# More pragmatically, consider redesigning your tokenizer.
losses = tf.keras.losses.sparse_categorical_crossentropy(
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
if training_args.do_train:
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# Compute the per-sample loss only over the unmasked tokens
losses = tf.ragged.boolean_mask(losses, y_true != -100)
losses = tf.reduce_mean(losses, axis=-1)
return losses
else:
optimizer = None
# endregion
# region Metric and KerasMetricCallback
if training_args.do_eval:
metric = evaluate.load("rouge")
if data_args.val_max_target_length is None:
data_args.val_max_target_length = data_args.max_target_length
gen_kwargs = {
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
"num_beams": data_args.num_beams,
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
}
def compute_metrics(preds):
predictions, labels = preds
if isinstance(predictions, tuple):
predictions = predictions[0]
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
metrics = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
# Only print the mid f-measures, but there are a lot of other statistics in there too!
metrics = {key: round(val.mid.fmeasure * 100, 4) for key, val in metrics.items()}
return metrics
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
# For more information, see the docs at
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
metric_callback = KerasMetricCallback(
metric_fn=compute_metrics,
eval_dataset=tf_eval_dataset,
predict_with_generate=True,
use_xla_generation=True,
generate_kwargs=gen_kwargs,
)
callbacks = [metric_callback]
else:
callbacks = []
# endregion
# region Metric
metric = evaluate.load("rouge")
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-summarization"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
# Because this training can be quite long, we save once per epoch.
callbacks.append(
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
)
# endregion
# region Training
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
eval_metrics = None
if training_args.do_train:
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
......@@ -648,28 +673,29 @@ def main():
logger.info(f" Total train batch size = {total_train_batch_size}")
logger.info(f" Total optimization steps = {num_train_steps}")
model.fit(
tf_train_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=num_update_steps_per_epoch,
)
if training_args.xla and not data_args.pad_to_max_length:
logger.warning(
"XLA training may be slow at first when --pad_to_max_length is not set "
"until all possible shapes have been compiled."
)
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
eval_metrics = {key: val[-1] for key, val in history.history.items()}
# endregion
# region Validation
if data_args.val_max_target_length is None:
data_args.val_max_target_length = data_args.max_target_length
gen_kwargs = {
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
"num_beams": data_args.num_beams,
}
if training_args.do_eval:
if training_args.do_eval and not training_args.do_train:
# Do a standalone evaluation run
logger.info("Evaluation...")
for batch, labels in tqdm(
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
):
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
@tf.function(jit_compile=True)
def generate(**kwargs):
return model.generate(**kwargs)
for batch, labels in tf_eval_dataset:
batch.update(gen_kwargs)
generated_tokens = model.generate(**batch)
generated_tokens = generate(**batch)
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
......@@ -679,13 +705,19 @@ def main():
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
result = metric.compute(use_stemmer=True)
result = {k: round(v * 100, 4) for k, v in result.items()}
eval_metrics = metric.compute(use_stemmer=True)
result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()}
logger.info(result)
# endregion
if training_args.output_dir is not None:
if training_args.output_dir is not None and eval_metrics is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metrics))
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
......
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import logging
import os
import sys
from unittest import skip
from unittest.mock import patch
import tensorflow as tf
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow
SRC_DIRS = [
os.path.join(os.path.dirname(__file__), dirname)
for dirname in [
"text-generation",
"text-classification",
"token-classification",
"language-modeling",
"multiple-choice",
"question-answering",
"summarization",
"translation",
]
]
sys.path.extend(SRC_DIRS)
if SRC_DIRS is not None:
import run_clm
import run_mlm
import run_ner
import run_qa as run_squad
import run_summarization
import run_swag
import run_text_classification
import run_translation
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
def get_setup_file():
parser = argparse.ArgumentParser()
parser.add_argument("-f")
args = parser.parse_args()
return args.f
def get_results(output_dir):
results = {}
path = os.path.join(output_dir, "all_results.json")
if os.path.exists(path):
with open(path, "r") as f:
results = json.load(f)
else:
raise ValueError(f"can't find {path}")
return results
def is_cuda_available():
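# Mixed precision (--fp16) is only exercised in these tests when a GPU is available.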
return bool(tf.config.list_physical_devices("GPU"))
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
class ExamplesTests(TestCasePlus):
@skip("Skipping until shape inference for to_tf_dataset PR is merged.")
def test_run_text_classification(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_text_classification.py
--model_name_or_path distilbert-base-uncased
--output_dir {tmp_dir}
--overwrite_output_dir
--train_file ./tests/fixtures/tests_samples/MRPC/train.csv
--validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
--do_train
--do_eval
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
--learning_rate=1e-4
--max_steps=10
--warmup_steps=2
--seed=42
--max_seq_length=128
""".split()
if is_cuda_available():
testargs.append("--fp16")
with patch.object(sys, "argv", testargs):
run_text_classification.main()
# Reset the mixed precision policy so we don't break other tests
tf.keras.mixed_precision.set_global_policy("float32")
result = get_results(tmp_dir)
self.assertGreaterEqual(result["eval_accuracy"], 0.75)
def test_run_clm(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_clm.py
--model_name_or_path distilgpt2
--train_file ./tests/fixtures/sample_text.txt
--validation_file ./tests/fixtures/sample_text.txt
--do_train
--do_eval
--block_size 128
--per_device_train_batch_size 2
--per_device_eval_batch_size 1
--num_train_epochs 2
--output_dir {tmp_dir}
--overwrite_output_dir
""".split()
if len(tf.config.list_physical_devices("GPU")) > 1:
# Skipping because there are not enough batches to train the model, and it would also need drop_last to work.
return
with patch.object(sys, "argv", testargs):
run_clm.main()
result = get_results(tmp_dir)
self.assertLess(result["eval_perplexity"], 100)
def test_run_mlm(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_mlm.py
--model_name_or_path distilroberta-base
--train_file ./tests/fixtures/sample_text.txt
--validation_file ./tests/fixtures/sample_text.txt
--max_seq_length 64
--output_dir {tmp_dir}
--overwrite_output_dir
--do_train
--do_eval
--prediction_loss_only
--num_train_epochs=1
""".split()
with patch.object(sys, "argv", testargs):
run_mlm.main()
result = get_results(tmp_dir)
self.assertLess(result["eval_perplexity"], 42)
def test_run_ner(self):
# With so little data, distributed training needs more epochs to get the score on par with 0/1 GPU.
epochs = 7 if get_gpu_count() > 1 else 2
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_ner.py
--model_name_or_path bert-base-uncased
--train_file tests/fixtures/tests_samples/conll/sample.json
--validation_file tests/fixtures/tests_samples/conll/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--do_train
--do_eval
--warmup_steps=2
--learning_rate=2e-4
--per_device_train_batch_size=2
--per_device_eval_batch_size=2
--num_train_epochs={epochs}
--seed 7
""".split()
with patch.object(sys, "argv", testargs):
run_ner.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["accuracy"], 0.75)
def test_run_squad(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_qa.py
--model_name_or_path bert-base-uncased
--version_2_with_negative
--train_file tests/fixtures/tests_samples/SQUAD/sample.json
--validation_file tests/fixtures/tests_samples/SQUAD/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=10
--warmup_steps=2
--do_train
--do_eval
--learning_rate=2e-4
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
""".split()
with patch.object(sys, "argv", testargs):
run_squad.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["f1"], 30)
self.assertGreaterEqual(result["exact"], 30)
def test_run_swag(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_swag.py
--model_name_or_path bert-base-uncased
--train_file tests/fixtures/tests_samples/swag/sample.json
--validation_file tests/fixtures/tests_samples/swag/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=20
--warmup_steps=2
--do_train
--do_eval
--learning_rate=2e-4
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
""".split()
with patch.object(sys, "argv", testargs):
run_swag.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["val_accuracy"], 0.8)
@slow
def test_run_summarization(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_summarization.py
--model_name_or_path t5-small
--train_file tests/fixtures/tests_samples/xsum/sample.json
--validation_file tests/fixtures/tests_samples/xsum/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=50
--warmup_steps=8
--do_train
--do_eval
--learning_rate=2e-4
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
""".split()
with patch.object(sys, "argv", testargs):
run_summarization.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["rouge1"], 10)
self.assertGreaterEqual(result["rouge2"], 2)
self.assertGreaterEqual(result["rougeL"], 7)
self.assertGreaterEqual(result["rougeLsum"], 7)
@slow
def test_run_translation(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_translation.py
--model_name_or_path Rocketknight1/student_marian_en_ro_6_1
--source_lang en
--target_lang ro
--train_file tests/fixtures/tests_samples/wmt16/sample.json
--validation_file tests/fixtures/tests_samples/wmt16/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--warmup_steps=8
--do_train
--do_eval
--learning_rate=3e-3
--num_train_epochs 12
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
--source_lang en_XX
--target_lang ro_RO
""".split()
with patch.object(sys, "argv", testargs):
run_translation.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["bleu"], 30)
......@@ -16,6 +16,7 @@
""" Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
import json
import logging
import os
import sys
......@@ -35,32 +36,16 @@ from transformers import (
DefaultDataCollator,
HfArgumentParser,
PretrainedConfig,
PushToHubCallback,
TFAutoModelForSequenceClassification,
TFTrainingArguments,
create_optimizer,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
# region Helper functions
class SavePretrainedCallback(tf.keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
# that saves the model with this method after each epoch.
def __init__(self, output_dir, **kwargs):
super().__init__()
self.output_dir = output_dir
def on_epoch_end(self, epoch, logs=None):
self.model.save_pretrained(self.output_dir)
# endregion
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.22.0.dev0")
......@@ -312,7 +297,6 @@ def main():
# region Dataset preprocessing
sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
# Padding strategy
if data_args.pad_to_max_length:
......@@ -394,24 +378,11 @@ def main():
)
# endregion
# region Optimizer, loss and compilation
optimizer = tf.keras.optimizers.Adam(
learning_rate=training_args.learning_rate,
beta_1=training_args.adam_beta1,
beta_2=training_args.adam_beta2,
epsilon=training_args.adam_epsilon,
clipnorm=training_args.max_grad_norm,
)
if is_regression:
loss_fn = tf.keras.losses.MeanSquaredError()
metrics = []
else:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
# endregion
# region Convert data to a tf.data.Dataset
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
num_replicas = training_args.strategy.num_replicas_in_sync
tf_data = dict()
max_samples = {
"train": data_args.max_train_samples,
......@@ -428,31 +399,89 @@ def main():
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
if key == "train":
shuffle = True
batch_size = training_args.per_device_train_batch_size
drop_remainder = True # Saves us worrying about scaling gradients for the last batch
batch_size = training_args.per_device_train_batch_size * num_replicas
else:
shuffle = False
batch_size = training_args.per_device_eval_batch_size
drop_remainder = False
batch_size = training_args.per_device_eval_batch_size * num_replicas
samples_limit = max_samples[key]
dataset = datasets[key]
if samples_limit is not None:
dataset = dataset.select(range(samples_limit))
data = dataset.to_tf_dataset(
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
data = model.prepare_tf_dataset(
dataset,
shuffle=shuffle,
batch_size=batch_size,
collate_fn=data_collator,
drop_remainder=drop_remainder,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in dataset.column_names else None,
tokenizer=tokenizer,
)
data = data.with_options(dataset_options)
tf_data[key] = data
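# For reference, a rough lower-level equivalent of the prepare_tf_dataset() call above using
# dataset.to_tf_dataset() directly (a sketch only; here the column and label names have to be
# spelled out by hand instead of being inferred from the model's input names):
#
#     data = dataset.to_tf_dataset(
#         columns=[col for col in dataset.column_names if col != "label"],
#         label_cols="label" if "label" in dataset.column_names else None,
#         shuffle=shuffle,
#         batch_size=batch_size,
#         collate_fn=data_collator,
#     )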
# endregion
# region Optimizer, loss and compilation
if training_args.do_train:
num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
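# For example, with 500 total train steps and --warmup_ratio 0.1, num_warmup_steps comes out to 50.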
optimizer, schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
else:
optimizer = "adam" # Just write anything because we won't be using it
if is_regression:
metrics = []
else:
metrics = ["accuracy"]
model.compile(optimizer=optimizer, metrics=metrics, jit_compile=training_args.xla)
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
push_to_hub_model_id = f"{model_name}-finetuned-glue"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
model_card_kwargs["task_name"] = data_args.task_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training and validation
if training_args.do_train:
callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
if training_args.do_eval and not data_args.task_name == "mnli":
# Do both evaluation and training in the Keras fit loop, unless the task is MNLI
# because MNLI has two validation sets
......@@ -472,6 +501,12 @@ def main():
# We normally do validation as part of the Keras fit loop, but we run it independently
# if there was no fit() step (because we didn't train the model) or if the task is MNLI,
# because MNLI has a separate validation-mismatched validation set
# In this example we only compute loss and accuracy on the validation set each epoch, and leave
# the more advanced metrics for the end of training. If you'd like to compute metrics every epoch
# that are too complex to be written as standard Keras metrics, you can use our KerasMetricCallback
# instead (a commented sketch follows below). See
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
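# A minimal sketch (not used by this script) of what that could look like for accuracy, assuming
# numpy is imported as np and tf_data["validation"] is built as above; metric_fn receives a
# (predictions, labels) tuple, and the exact prediction format may need adjusting for your model:
#
#     from transformers import KerasMetricCallback
#
#     def accuracy_metric_fn(eval_predictions):
#         logits, labels = eval_predictions
#         preds = np.argmax(logits, axis=-1)
#         return {"accuracy": float((preds == labels).mean())}
#
#     callbacks.append(
#         KerasMetricCallback(metric_fn=accuracy_metric_fn, eval_dataset=tf_data["validation"])
#     )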
logger.info("*** Evaluate ***")
# Loop to handle MNLI double evaluation (matched, mis-matched)
......@@ -489,6 +524,10 @@ def main():
eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"])
print(f"Evaluation metrics ({task}):")
print(eval_metrics)
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metrics))
# endregion
......@@ -538,6 +577,10 @@ def main():
writer.write(f"{index}\t{item}\n")
# endregion
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
main()
......@@ -16,6 +16,7 @@
""" Fine-tuning the library models for sequence classification."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
import json
import logging
import os
import sys
......@@ -29,12 +30,12 @@ from datasets import load_dataset
from transformers import (
AutoConfig,
AutoTokenizer,
DataCollatorWithPadding,
DefaultDataCollator,
HfArgumentParser,
PretrainedConfig,
PushToHubCallback,
TFAutoModelForSequenceClassification,
TFTrainingArguments,
create_optimizer,
set_seed,
)
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, send_example_telemetry
......@@ -383,10 +384,6 @@ def main():
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
if data_args.pad_to_max_length:
data_collator = DefaultDataCollator(return_tensors="tf")
else:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
# endregion
with training_args.strategy.scope():
......@@ -409,24 +406,10 @@ def main():
)
# endregion
# region Optimizer, loss and compilation
optimizer = tf.keras.optimizers.Adam(
learning_rate=training_args.learning_rate,
beta_1=training_args.adam_beta1,
beta_2=training_args.adam_beta2,
epsilon=training_args.adam_epsilon,
clipnorm=training_args.max_grad_norm,
)
if is_regression:
loss_fn = tf.keras.losses.MeanSquaredError()
metrics = []
else:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
# endregion
# region Convert data to a tf.data.Dataset
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
num_replicas = training_args.strategy.num_replicas_in_sync
tf_data = dict()
max_samples = {
......@@ -438,50 +421,121 @@ def main():
if key not in datasets:
tf_data[key] = None
continue
if (
(key == "train" and not training_args.do_train)
or (key == "validation" and not training_args.do_eval)
or (key == "test" and not training_args.do_predict)
):
tf_data[key] = None
continue
if key in ("train", "validation"):
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
if key == "train":
shuffle = True
batch_size = training_args.per_device_train_batch_size
drop_remainder = True # Saves us worrying about scaling gradients for the last batch
batch_size = training_args.per_device_train_batch_size * num_replicas
else:
shuffle = False
batch_size = training_args.per_device_eval_batch_size
drop_remainder = False
batch_size = training_args.per_device_eval_batch_size * num_replicas
samples_limit = max_samples[key]
dataset = datasets[key]
if samples_limit is not None:
dataset = dataset.select(range(samples_limit))
data = dataset.to_tf_dataset(
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
data = model.prepare_tf_dataset(
dataset,
shuffle=shuffle,
batch_size=batch_size,
collate_fn=data_collator,
drop_remainder=drop_remainder,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in dataset.column_names else None,
tokenizer=tokenizer,
)
data = data.with_options(dataset_options)
tf_data[key] = data
# endregion
# region Optimizer, loss and compilation
if training_args.do_train:
num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
optimizer, schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
else:
optimizer = None
if is_regression:
metrics = []
else:
metrics = ["accuracy"]
model.compile(optimizer=optimizer, metrics=metrics)
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
push_to_hub_model_id = f"{model_name}-finetuned-text-classification"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training and validation
if tf_data["train"] is not None:
callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
model.fit(
tf_data["train"],
validation_data=tf_data["validation"],
epochs=int(training_args.num_train_epochs),
callbacks=callbacks,
)
elif tf_data["validation"] is not None:
# If there's a validation dataset but no training set, just evaluate the metrics
if tf_data["validation"] is not None:
logger.info("Computing metrics on validation data...")
if is_regression:
loss = model.evaluate(tf_data["validation"])
logger.info(f"Loss: {loss:.5f}")
logger.info(f"Eval loss: {loss:.5f}")
else:
loss, accuracy = model.evaluate(tf_data["validation"])
logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%")
logger.info(f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%")
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
eval_dict = {"eval_loss": loss}
if not is_regression:
eval_dict["eval_accuracy"] = accuracy
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_dict))
# endregion
# region Prediction
......@@ -501,14 +555,9 @@ def main():
logger.info(f"Wrote predictions to {output_test_file}!")
# endregion
# region Prediction losses
# This section is outside the scope() because it's very quick to compute, but behaves badly inside it
if "test" in datasets and "label" in datasets["test"].features:
print("Computing prediction loss on test labels...")
labels = datasets["test"]["label"]
loss = float(loss_fn(labels, predictions).numpy())
print(f"Test loss: {loss:.4f}")
# endregion
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
if __name__ == "__main__":
......
......@@ -18,14 +18,14 @@ Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, C
without using a Trainer.
"""
import json
import logging
import os
import random
from dataclasses import dataclass, field
from functools import partial
from typing import Optional
import datasets
import numpy as np
import tensorflow as tf
from datasets import ClassLabel, load_dataset
......@@ -33,10 +33,11 @@ import evaluate
import transformers
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AutoConfig,
AutoTokenizer,
DataCollatorForTokenClassification,
HfArgumentParser,
PushToHubCallback,
TFAutoModelForTokenClassification,
TFTrainingArguments,
create_optimizer,
......@@ -48,11 +49,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt")
# region Command-line arguments
......@@ -195,61 +192,6 @@ class DataTrainingArguments:
# endregion
# region Data generator
def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None):
# Trim off the last partial batch if present
if shuffle:
sample_ordering = np.random.permutation(len(dataset))
else:
sample_ordering = np.arange(len(dataset))
for sample_idx in sample_ordering:
example = dataset[int(sample_idx)]
# Handle dicts with proper padding and conversion to tensor.
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
if tokenizer.pad_token_id is not None:
example["labels"][example["attention_mask"] == 0] = -100
example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()}
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
return
# endregion
# region Helper functions
def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle):
train_generator = partial(sample_generator, dataset, tokenizer, shuffle=shuffle)
train_signature = {
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
for feature in dataset.features
if feature != "special_tokens_mask"
}
# This may need to be changed depending on your particular model or tokenizer!
padding_values = {key: tf.convert_to_tensor(0, dtype=tf.int64) for key in dataset.features}
padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64)
if tokenizer.pad_token_id is not None:
padding_values["input_ids"] = tf.convert_to_tensor(tokenizer.pad_token_id, dtype=tf.int64)
train_signature["labels"] = train_signature["input_ids"]
train_signature = (train_signature, train_signature["labels"])
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_dataset = (
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
.with_options(options)
.padded_batch(
batch_size=total_batch_size,
drop_remainder=True,
padding_values=(padding_values, np.array(0, dtype=np.int64)),
)
.repeat(int(num_epochs))
)
return tf_dataset
# endregion
def main():
# region Argument Parsing
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
......@@ -419,6 +361,14 @@ def main():
train_dataset = processed_raw_datasets["train"]
eval_dataset = processed_raw_datasets["validation"]
if data_args.max_train_samples is not None:
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
if data_args.max_eval_samples is not None:
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
eval_dataset = eval_dataset.select(range(max_eval_samples))
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
......@@ -439,43 +389,62 @@ def main():
# endregion
# region Create TF datasets
# We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
# well as inputs.
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
num_replicas = training_args.strategy.num_replicas_in_sync
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
train_batches_per_epoch = len(train_dataset) // total_train_batch_size
tf_train_dataset = dataset_to_tf(
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
tokenizer,
total_batch_size=total_train_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=collate_fn,
batch_size=total_train_batch_size,
shuffle=True,
)
).with_options(dataset_options)
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size
tf_eval_dataset = dataset_to_tf(
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
tokenizer,
total_batch_size=total_eval_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=collate_fn,
batch_size=total_eval_batch_size,
shuffle=False,
)
).with_options(dataset_options)
# endregion
# region Optimizer, loss and compilation
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch),
num_warmup_steps=training_args.warmup_steps,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
def dummy_loss(y_true, y_pred):
return tf.reduce_mean(y_pred)
model.compile(loss={"loss": dummy_loss}, optimizer=optimizer)
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
# endregion
# Metrics
......@@ -517,6 +486,39 @@ def main():
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-token-classification"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
......@@ -524,23 +526,43 @@ def main():
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
logger.info(f" Total train batch size = {total_train_batch_size}")
# Only show the progress bar once on each machine.
model.fit(
tf_train_dataset,
validation_data=tf_eval_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=train_batches_per_epoch,
validation_steps=eval_batches_per_epoch,
callbacks=callbacks,
)
# endregion
# region Predictions
# For predictions, we preload the entire validation set - note that if you have a really giant validation
# set, you might need to change this!
eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features}
predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"]
predictions = tf.math.argmax(predictions, axis=-1)
labels = np.array(eval_inputs["labels"])
labels[np.array(eval_inputs["attention_mask"]) == 0] = -100
# If you have variable batch sizes (i.e. not using pad_to_max_length), then
# this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq
# length from predict().
try:
predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"]
except tf.python.framework.errors_impl.InvalidArgumentError:
raise ValueError(
"Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
"then you will need to use --pad_to_max_length to generate predictions, as older "
"versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
)
if isinstance(predictions, tf.RaggedTensor):
predictions = predictions.to_tensor(default_value=-100)
predictions = tf.math.argmax(predictions, axis=-1).numpy()
if "label" in eval_dataset:
labels = eval_dataset.with_format("tf")["label"]
else:
labels = eval_dataset.with_format("tf")["labels"]
if isinstance(labels, tf.RaggedTensor):
labels = labels.to_tensor(default_value=-100)
labels = labels.numpy()
attention_mask = eval_dataset.with_format("tf")["attention_mask"]
if isinstance(attention_mask, tf.RaggedTensor):
attention_mask = attention_mask.to_tensor(default_value=-100)
attention_mask = attention_mask.numpy()
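# Padding positions (where attention_mask == 0) get the label -100, which the get_labels() helper
# treats as "ignore" when building the predictions and references below.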
labels[attention_mask == 0] = -100
preds, refs = get_labels(predictions, labels)
metric.add_batch(
predictions=preds,
......@@ -550,12 +572,15 @@ def main():
logger.info("Evaluation metrics:")
for key, val in eval_metric.items():
logger.info(f"{key}: {val:.4f}")
# endregion
# We don't do predictions in the strategy scope because there are some issues in there right now.
# They'll get fixed eventually, promise!
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metric))
# endregion
if training_args.output_dir is not None:
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
......
......@@ -18,30 +18,32 @@ Fine-tuning the library models for translation.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from functools import partial
from typing import Optional
import datasets
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import transformers
from transformers import (
AutoConfig,
AutoTokenizer,
DataCollatorForSeq2Seq,
HfArgumentParser,
KerasMetricCallback,
M2M100Tokenizer,
MBart50Tokenizer,
MBart50TokenizerFast,
MBartTokenizer,
MBartTokenizerFast,
PushToHubCallback,
TFAutoModelForSeq2SeqLM,
TFTrainingArguments,
create_optimizer,
......@@ -224,6 +226,16 @@ class DataTrainingArguments:
source_prefix: Optional[str] = field(
default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
)
forced_bos_token: Optional[str] = field(
default=None,
metadata={
"help": (
"The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
" multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
" be the target language token.(Usually it is the target language token)"
)
},
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
......@@ -239,70 +251,6 @@ class DataTrainingArguments:
self.val_max_target_length = self.max_target_length
# endregion
# region Data generator
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
if shuffle:
sample_ordering = np.random.permutation(len(dataset))
else:
sample_ordering = np.arange(len(dataset))
for sample_idx in sample_ordering:
example = dataset[int(sample_idx)]
# Handle dicts with proper padding and conversion to tensor.
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()}
if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
decoder_input_ids = model.prepare_decoder_input_ids_from_labels(
labels=tf.expand_dims(example["labels"], 0)
)
example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0)
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
return
# endregion
# region Helper functions
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
if dataset is None:
return None
train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)
train_signature = {
feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
for feature in dataset.features
if feature != "special_tokens_mask"
}
if (
model is not None
and "decoder_input_ids" not in train_signature
and hasattr(model, "prepare_decoder_input_ids_from_labels")
):
train_signature["decoder_input_ids"] = train_signature["labels"]
# This may need to be changed depending on your particular model or tokenizer!
padding_values = {
key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32)
for key in train_signature.keys()
}
padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)
train_signature["labels"] = train_signature["input_ids"]
train_signature = (train_signature, train_signature["labels"])
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
tf_dataset = (
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
.with_options(options)
.padded_batch(
batch_size=total_batch_size,
drop_remainder=True,
padding_values=(padding_values, np.array(-100, dtype=np.int32)),
)
.repeat(int(num_epochs))
)
return tf_dataset
# endregion
......@@ -541,67 +489,149 @@ def main():
# endregion
# region Prepare TF Dataset objects
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
label_pad_token_id=label_pad_token_id,
pad_to_multiple_of=64, # Reduce the number of unique shapes for XLA, especially for generation
return_tensors="tf",
)
num_replicas = training_args.strategy.num_replicas_in_sync
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
tf_train_dataset = dataset_to_tf(
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
model,
tokenizer,
total_batch_size=total_train_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=data_collator,
batch_size=total_train_batch_size,
shuffle=True,
)
tf_eval_dataset = dataset_to_tf(
eval_dataset,
model,
tokenizer,
total_eval_batch_size,
num_epochs=1,
shuffle=False,
)
).with_options(dataset_options)
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset, collate_fn=data_collator, batch_size=total_eval_batch_size, shuffle=False
).with_options(dataset_options)
# endregion
# region Optimizer, loss and LR scheduling
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=training_args.warmup_steps,
)
def masked_sparse_categorical_crossentropy(y_true, y_pred):
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
# More pragmatically, consider redesigning your tokenizer.
losses = tf.keras.losses.sparse_categorical_crossentropy(
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
# region Optimizer and LR scheduling
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
if training_args.do_train:
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# Compute the per-sample loss only over the unmasked tokens
losses = tf.ragged.boolean_mask(losses, y_true != -100)
losses = tf.reduce_mean(losses, axis=-1)
return losses
else:
optimizer = None
# endregion
# region Metric and postprocessing
metric = evaluate.load("sacrebleu")
if training_args.do_eval:
metric = evaluate.load("sacrebleu")
def postprocess_text(preds, labels):
preds = [pred.strip() for pred in preds]
labels = [[label.strip()] for label in labels]
if data_args.val_max_target_length is None:
data_args.val_max_target_length = data_args.max_target_length
gen_kwargs = {
"max_length": data_args.val_max_target_length,
"num_beams": data_args.num_beams,
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
}
def postprocess_text(preds, labels):
preds = [pred.strip() for pred in preds]
labels = [[label.strip()] for label in labels]
return preds, labels
def compute_metrics(preds):
predictions, labels = preds
if isinstance(predictions, tuple):
predictions = predictions[0]
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
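# metric.compute() for sacrebleu returns a dict of corpus-level statistics; we only keep the
# main "score" field and report it as "bleu".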
metrics = metric.compute(predictions=decoded_preds, references=decoded_labels)
return {"bleu": metrics["score"]}
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
# For more information, see the docs at
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
metric_callback = KerasMetricCallback(
metric_fn=compute_metrics,
eval_dataset=tf_eval_dataset,
predict_with_generate=True,
use_xla_generation=True,
generate_kwargs=gen_kwargs,
)
callbacks = [metric_callback]
else:
callbacks = []
return preds, labels
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None]
if len(languages) > 0:
model_card_kwargs["language"] = languages
if training_args.push_to_hub:
# Because this training can be quite long, we save once per epoch.
callbacks.append(
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
)
# endregion
# region Training
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
eval_metrics = None
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
if training_args.do_train:
logger.info("***** Running training *****")
......@@ -611,41 +641,48 @@ def main():
logger.info(f" Total train batch size = {total_train_batch_size}")
logger.info(f" Total optimization steps = {num_train_steps}")
model.fit(
tf_train_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=num_update_steps_per_epoch,
)
if training_args.xla and not data_args.pad_to_max_length:
logger.warning(
"XLA training may be slow at first when --pad_to_max_length is not set "
"until all possible shapes have been compiled."
)
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
eval_metrics = {key: val[-1] for key, val in history.history.items()}
# endregion
# region Validation
if data_args.val_max_target_length is None:
data_args.val_max_target_length = data_args.max_target_length
gen_kwargs = {
"max_length": data_args.val_max_target_length,
"num_beams": data_args.num_beams,
}
if training_args.do_eval:
logger.info("Evaluation...")
for batch, labels in tqdm(
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
):
batch.update(gen_kwargs)
generated_tokens = model.generate(**batch)
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
eval_metric = metric.compute()
logger.info({"bleu": eval_metric["score"]})
if training_args.do_eval and not training_args.do_train:
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
@tf.function(jit_compile=True)
def generate(**kwargs):
return model.generate(**kwargs)
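# The data collator above pads inputs to multiples of 64, which keeps the number of distinct
# shapes small, so the XLA-compiled generate() should only need to recompile a handful of times.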
if training_args.do_eval:
logger.info("Evaluation...")
for batch, labels in tf_eval_dataset:
batch.update(gen_kwargs)
generated_tokens = generate(**batch)
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
eval_metrics = metric.compute()
logger.info({"bleu": eval_metrics["score"]})
# endregion
if training_args.output_dir is not None:
if training_args.output_dir is not None and eval_metrics is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metrics))
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
......
......@@ -87,6 +87,8 @@ def create_optimizer(
adam_beta1: float = 0.9,
adam_beta2: float = 0.999,
adam_epsilon: float = 1e-8,
adam_clipnorm: Optional[float] = None,
adam_global_clipnorm: Optional[float] = None,
weight_decay_rate: float = 0.0,
power: float = 1.0,
include_in_weight_decay: Optional[List[str]] = None,
......@@ -109,6 +111,11 @@ def create_optimizer(
The beta2 to use in Adam.
adam_epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon to use in Adam.
adam_clipnorm: (`float`, *optional*, defaults to `None`):
If not `None`, clip the gradient norm for each weight tensor to this value.
adam_global_clipnorm: (`float`, *optional*, defaults to `None`)
If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
weight tensors, as if they were concatenated into a single vector.
weight_decay_rate (`float`, *optional*, defaults to 0):
The weight decay to use.
power (`float`, *optional*, defaults to 1.0):
......@@ -137,12 +144,19 @@ def create_optimizer(
beta_1=adam_beta1,
beta_2=adam_beta2,
epsilon=adam_epsilon,
clipnorm=adam_clipnorm,
global_clipnorm=adam_global_clipnorm,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
include_in_weight_decay=include_in_weight_decay,
)
else:
optimizer = tf.keras.optimizers.Adam(
learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon
learning_rate=lr_schedule,
beta_1=adam_beta1,
beta_2=adam_beta2,
epsilon=adam_epsilon,
clipnorm=adam_clipnorm,
global_clipnorm=adam_global_clipnorm,
)
# We return the optimizer and the LR scheduler in order to better track the
# evolution of the LR independently of the optimizer.
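# A short usage sketch of the new clipping arguments (all values here are illustrative only):
#
#     optimizer, lr_schedule = create_optimizer(
#         init_lr=5e-5,
#         num_train_steps=10000,
#         num_warmup_steps=500,
#         adam_global_clipnorm=1.0,  # clip the norm of all gradients, treated as one vector
#     )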
......
......@@ -106,6 +106,7 @@ class OptimizerNames(ExplicitEnum):
@dataclass
class TrainingArguments:
framework = "pt"
"""
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
itself**.
......@@ -1039,25 +1040,25 @@ class TrainingArguments:
self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
if self.run_name is None:
self.run_name = self.output_dir
if self.framework == "pt" and is_torch_available():
if self.fp16_backend and self.fp16_backend != "auto":
warnings.warn(
"`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
" `half_precision_backend` instead",
FutureWarning,
)
self.half_precision_backend = self.fp16_backend
if self.fp16_backend and self.fp16_backend != "auto":
warnings.warn(
"`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
" `half_precision_backend` instead",
FutureWarning,
)
self.half_precision_backend = self.fp16_backend
if self.bf16 or self.bf16_full_eval:
if self.bf16 or self.bf16_full_eval:
if self.no_cuda and not is_torch_bf16_cpu_available():
# cpu
raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10")
elif not self.no_cuda and not is_torch_bf16_gpu_available():
# gpu
raise ValueError(
"Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
)
if self.no_cuda and not is_torch_bf16_cpu_available():
# cpu
raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10")
elif not self.no_cuda and not is_torch_bf16_gpu_available():
# gpu
raise ValueError(
"Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
)
if self.fp16 and self.bf16:
raise ValueError("At most one of fp16 and bf16 can be True, but not both")
......@@ -1084,7 +1085,8 @@ class TrainingArguments:
self.optim = OptimizerNames.ADAFACTOR
if (
is_torch_available()
self.framework == "pt"
and is_torch_available()
and (self.device.type != "cuda")
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
and (self.fp16 or self.fp16_full_eval)
......@@ -1095,7 +1097,8 @@ class TrainingArguments:
)
if (
is_torch_available()
self.framework == "pt"
and is_torch_available()
and (self.device.type != "cuda")
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
and (self.device.type != "cpu")
......@@ -1106,7 +1109,7 @@ class TrainingArguments:
" (`--bf16_full_eval`) can only be used on CUDA or CPU devices."
)
if is_torch_available() and self.tf32 is not None:
if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
if self.tf32:
if is_torch_tf32_available():
torch.backends.cuda.matmul.allow_tf32 = True
......
......@@ -28,6 +28,7 @@ if is_tf_available():
@dataclass
class TFTrainingArguments(TrainingArguments):
framework = "tf"
"""
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
itself**.
......@@ -188,9 +189,6 @@ class TFTrainingArguments(TrainingArguments):
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]:
logger.info("Tensorflow: setting up strategy")
if self.xla:
tf.config.optimizer.set_jit(True)
gpus = tf.config.list_physical_devices("GPU")
# Set to float16 at first
......