Unverified Commit 3956b133 authored by Joao Gante, committed by GitHub

TF text classification examples (#15704)

* Working example with to_tf_dataset

* updated text_classification

* more comments
parent 142b69f2
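The changes below swap a hand-rolled `convert_dataset_for_tensorflow` helper for `datasets.Dataset.to_tf_dataset` combined with a `transformers` data collator. As a rough orientation, a minimal sketch of that pattern follows; the checkpoint, GLUE task, and column names are illustrative assumptions, not values taken from the diff:

```python
# Minimal sketch of the to_tf_dataset + data collator pattern adopted in this commit.
# The checkpoint, task, and column names are illustrative assumptions.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

raw = load_dataset("glue", "sst2", split="train[:1%]")
tokenized = raw.map(lambda batch: tokenizer(batch["sentence"], truncation=True), batched=True)

# The collator pads each batch to its own longest sequence at iteration time,
# so no custom ragged-tensor handling is needed.
collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
tf_train = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids"],
    label_cols="label",
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
)

# Inspect one batch: features come as a dict of dense tensors, labels separately.
for features, labels in tf_train.take(1):
    print({name: tensor.shape for name, tensor in features.items()}, labels.shape)
```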
@@ -65,12 +65,7 @@ Those processors are:
Additionally, the following method can be used to load values from a data file and convert them to a list of
[`~data.processors.utils.InputExample`].
-.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
+[[autodoc]] data.processors.glue.glue_convert_examples_to_features
-### Example usage
-An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
## XNLI
@@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcessor`]
Additionally, the following method can be used to convert SQuAD examples into
[`~data.processors.utils.SquadFeatures`] that can be used as model inputs.
-.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
+[[autodoc]] data.processors.squad.squad_convert_examples_to_features
These processors as well as the aforementioned method can be used with files containing the data as well as with the
...
@@ -457,7 +457,8 @@ def main():
        else:
            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
-    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
+    # we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
...
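The revised comment points out that `Trainer` only falls back to `DataCollatorWithPadding` when a tokenizer is supplied and no collator is given, which is why pre-padded inputs switch to `default_data_collator`. A hedged sketch of that decision, with an assumed checkpoint and dataset rather than the script's own arguments:

```python
# Sketch of the collator choice described by the revised comment above.
# Checkpoint, dataset, and hyperparameters are illustrative assumptions.
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

pad_to_max_length = True  # mirrors data_args.pad_to_max_length
dataset = load_dataset("glue", "sst2", split="train[:1%]").map(
    lambda batch: tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length" if pad_to_max_length else False,
        max_length=128,
    ),
    batched=True,
)

# Already padded -> a collator that only stacks tensors; otherwise let
# DataCollatorWithPadding pad dynamically per batch (Trainer's default when a
# tokenizer is passed and no collator is set).
data_collator = default_data_collator if pad_to_max_length else DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out", per_device_train_batch_size=8),
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
```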
@@ -30,6 +30,8 @@ import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
    HfArgumentParser,
    PretrainedConfig,
    TFAutoModelForSequenceClassification,
@@ -43,47 +45,6 @@ from transformers.utils import check_min_version
# region Helper functions
-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
@@ -377,6 +338,10 @@ def main():
    datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
    # endregion
    # region Metric function
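The four added lines choose between two collators: `DefaultDataCollator` simply stacks features that were already padded to a fixed length (`pad_to_max_length`), while `DataCollatorWithPadding` pads each batch to its own longest sequence. A small hedged illustration with made-up token IDs and an assumed checkpoint:

```python
# Hedged illustration of the two collators selected above; the token IDs and
# checkpoint are made-up examples, not values from the script.
from transformers import AutoTokenizer, DataCollatorWithPadding, DefaultDataCollator

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint

# Variable-length examples, as produced by tokenizing without padding.
features = [
    {"input_ids": [101, 7592, 102], "attention_mask": [1, 1, 1]},
    {"input_ids": [101, 7592, 2088, 999, 102], "attention_mask": [1, 1, 1, 1, 1]},
]

# Pads every example to the longest sequence in this particular batch.
dynamic = DataCollatorWithPadding(tokenizer, return_tensors="tf")
batch = dynamic(features)
print(batch["input_ids"].shape)  # (2, 5)

# Does no padding at all: it only stacks tensors, so it assumes the examples
# were already padded to a common length upstream (pad_to_max_length).
static = DefaultDataCollator(return_tensors="tf")
```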
@@ -426,11 +391,6 @@ def main():
    # region Convert data to a tf.data.Dataset
    tf_data = dict()
-    if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-        logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-        dataset_mode = "constant_batch"
-    else:
-        dataset_mode = "variable_batch"
    max_samples = {
        "train": data_args.max_train_samples,
        "validation": data_args.max_eval_samples,
@@ -456,13 +416,14 @@ def main():
        dataset = datasets[key]
        if samples_limit is not None:
            dataset = dataset.select(range(samples_limit))
-        data = convert_dataset_for_tensorflow(
-            dataset,
-            non_label_column_names,
+        data = dataset.to_tf_dataset(
+            columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
+            shuffle=shuffle,
            batch_size=batch_size,
-            dataset_mode=dataset_mode,
+            collate_fn=data_collator,
            drop_remainder=drop_remainder,
-            shuffle=shuffle,
+            # `label_cols` is needed for user-defined losses, such as in this example
+            label_cols="label" if "label" in dataset.column_names else None,
        )
        tf_data[key] = data
    # endregion
...
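The inline comment notes that `label_cols` matters because these scripts compile with a user-defined Keras loss, so labels must be yielded separately from the features. A hedged sketch of how the resulting `tf_data` dict is typically consumed; the checkpoint, label count, and hyperparameters are illustrative, and `tf_data` refers to the dict built in the loop above:

```python
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Illustrative checkpoint and label count; the script derives these from its arguments.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# Because to_tf_dataset() yields (features, label) pairs when label_cols is set,
# Keras can apply this user-defined loss to the model's logits.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(tf_data["train"], validation_data=tf_data.get("validation"), epochs=3)
```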
@@ -29,6 +29,8 @@ from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
    HfArgumentParser,
    PretrainedConfig,
    TFAutoModelForSequenceClassification,
@@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
        self.model.save_pretrained(self.output_dir)
-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
# endregion
@@ -399,6 +360,11 @@ def main():
        return result
    datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
    # endregion
    with training_args.strategy.scope():
@@ -464,18 +430,14 @@ def main():
        dataset = datasets[key]
        if samples_limit is not None:
            dataset = dataset.select(range(samples_limit))
-        if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-            logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-            dataset_mode = "constant_batch"
-        else:
-            dataset_mode = "variable_batch"
-        data = convert_dataset_for_tensorflow(
-            dataset,
-            non_label_column_names,
+        data = dataset.to_tf_dataset(
+            columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
+            shuffle=shuffle,
            batch_size=batch_size,
-            dataset_mode=dataset_mode,
+            collate_fn=data_collator,
            drop_remainder=drop_remainder,
-            shuffle=shuffle,
+            # `label_cols` is needed for user-defined losses, such as in this example
+            label_cols="label" if "label" in dataset.column_names else None,
        )
        tf_data[key] = data
    # endregion
...
@@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
    def train_step(self, data):
        """
-        A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss.
+        A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss.
        """
        # These are the only transformations `Model.fit` applies to user-input
        # data when a `tf.data.Dataset` is provided.
...