"tests/data2vec/test_modeling_data2vec_text.py" did not exist on "29c10a41d04f855c433a6cde7797b325651417d2"
Unverified commit 3956b133, authored by Joao Gante, committed by GitHub

TF text classification examples (#15704)

* Working example with to_tf_dataset

* updated text_classification

* more comments
parent 142b69f2
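The core of this commit is replacing a hand-rolled `convert_dataset_for_tensorflow` helper in the TensorFlow text classification examples with `datasets.Dataset.to_tf_dataset` plus a `DataCollator`. A minimal end-to-end sketch of that pattern (the checkpoint and dataset names are illustrative, not taken from this diff):

```python
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
raw = load_dataset("glue", "sst2", split="train[:1%]")

# Tokenize without padding; padding is deferred to the collator, so each batch is only
# padded to its own longest sequence.
tokenized = raw.map(lambda batch: tokenizer(batch["sentence"], truncation=True), batched=True)

collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
tf_train = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
)
# tf_train is a tf.data.Dataset of (features, labels) batches, ready for model.fit().
```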
@@ -65,12 +65,7 @@ Those processors are:
 Additionally, the following method can be used to load values from a data file and convert them to a list of
 [`~data.processors.utils.InputExample`].

-automethod,transformers.data.processors.glue.glue_convert_examples_to_features
-
-### Example usage
-
-An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
+[[autodoc]] data.processors.glue.glue_convert_examples_to_features

 ## XNLI
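An aside on the GLUE method referenced above: `glue_convert_examples_to_features` turns `InputExample`s into model-ready features. A hedged usage sketch, with the task and labels chosen purely for illustration:

```python
from transformers import AutoTokenizer, glue_convert_examples_to_features
from transformers.data.processors.utils import InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
examples = [
    InputExample(guid="0", text_a="A man is eating.", text_b="Someone is eating food.", label="1"),
    InputExample(guid="1", text_a="A man is eating.", text_b="The sky is blue.", label="0"),
]
# Returns a list of InputFeatures (input_ids, attention_mask, token_type_ids, label).
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")
```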
@@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcessor`]
 Additionally, the following method can be used to convert SQuAD examples into
 [`~data.processors.utils.SquadFeatures`] that can be used as model inputs.

-automethod,transformers.data.processors.squad.squad_convert_examples_to_features
+[[autodoc]] data.processors.squad.squad_convert_examples_to_features

 These processors as well as the aforementioned method can be used with files containing the data as well as with the
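Similarly, a hedged sketch of `squad_convert_examples_to_features`; the data directory is a placeholder:

```python
from transformers import AutoTokenizer, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
processor = SquadV2Processor()
examples = processor.get_dev_examples("path/to/squad")  # expects dev-v2.0.json in this directory

features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
)
```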
......
@@ -457,7 +457,8 @@ def main():
         else:
             return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

-    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
+    # we already did the padding.
     if data_args.pad_to_max_length:
         data_collator = default_data_collator
     elif training_args.fp16:
......
@@ -30,6 +30,8 @@ import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     HfArgumentParser,
     PretrainedConfig,
     TFAutoModelForSequenceClassification,
@@ -43,47 +45,6 @@ from transformers.utils import check_min_version

 # region Helper functions
-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
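For context on what the deleted helper was doing: it loaded each feature column as a `tf.RaggedTensor` and densified it per batch, so padding only ever went up to the longest sequence in that batch (unless `constant_batch` forced full-length padding for TPUs). A tiny standalone illustration of that ragged-tensor trick, not taken from the repository:

```python
import tensorflow as tf

# Two sequences of different lengths, kept ragged instead of pre-padded.
input_ids = tf.ragged.constant([[101, 7592, 102], [101, 2088, 2003, 2307, 102]])
dataset = tf.data.Dataset.from_tensor_slices({"input_ids": input_ids})

# Batch first, then densify: each batch is padded only to its own longest sequence.
batched = dataset.batch(2).map(lambda features: {k: v.to_tensor() for k, v in features.items()})
for batch in batched:
    print(batch["input_ids"].shape)  # (2, 5) -- padded to the longest sequence in this batch
```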
 class SavePretrainedCallback(tf.keras.callbacks.Callback):
     # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
     # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
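The body of that callback is collapsed in this diff. As a rough sketch (the constructor argument name is assumed and details may differ from the actual script), such a callback typically looks like:

```python
import tensorflow as tf

class SavePretrainedCallback(tf.keras.callbacks.Callback):
    def __init__(self, output_dir):
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        # save_pretrained() writes the weights plus the config needed to reload the model
        # later with from_pretrained().
        self.model.save_pretrained(self.output_dir)
```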
@@ -377,6 +338,10 @@ def main():
     datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
     # endregion

     # region Metric function
@@ -426,11 +391,6 @@ def main():
     # region Convert data to a tf.data.Dataset
     tf_data = dict()
-    if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-        logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-        dataset_mode = "constant_batch"
-    else:
-        dataset_mode = "variable_batch"
     max_samples = {
         "train": data_args.max_train_samples,
         "validation": data_args.max_eval_samples,
@@ -456,13 +416,14 @@ def main():
         dataset = datasets[key]
         if samples_limit is not None:
             dataset = dataset.select(range(samples_limit))
-        data = convert_dataset_for_tensorflow(
-            dataset,
-            non_label_column_names,
+        data = dataset.to_tf_dataset(
+            columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
+            shuffle=shuffle,
             batch_size=batch_size,
-            dataset_mode=dataset_mode,
+            collate_fn=data_collator,
             drop_remainder=drop_remainder,
-            shuffle=shuffle,
+            # `label_cols` is needed for user-defined losses, such as in this example
+            label_cols="label" if "label" in dataset.column_names else None,
         )
         tf_data[key] = data
     # endregion
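The `label_cols` comment above matters because these example scripts compute the loss in Keras rather than relying on the model's internal loss. A hedged sketch of how a dataset built this way is then consumed; `model`, `tf_data`, and `training_args` stand for the objects the script defines elsewhere, and the optimizer and loss shown are illustrative rather than copied from the diff:

```python
import tensorflow as tf

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    # label_cols="label" makes the dataset yield (features, labels) pairs, so Keras can
    # apply this user-defined loss to the model's output logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(tf_data["train"], validation_data=tf_data.get("validation"), epochs=int(training_args.num_train_epochs))
```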
......
@@ -29,6 +29,8 @@ from datasets import load_dataset
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     HfArgumentParser,
     PretrainedConfig,
     TFAutoModelForSequenceClassification,
@@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
         self.model.save_pretrained(self.output_dir)

-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
-

 # endregion
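This is the same helper being deleted from the second example script. With it gone, the fixed-shape ("constant_batch") behaviour is obtained by padding at tokenization time and stacking with `DefaultDataCollator`, mirroring the dynamic-padding sketch earlier; the model and dataset names below are illustrative, not taken from the diff:

```python
from datasets import load_dataset
from transformers import AutoTokenizer, DefaultDataCollator

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
raw = load_dataset("glue", "sst2", split="train[:1%]")
tokenized = raw.map(
    lambda batch: tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=128),
    batched=True,
)

# DefaultDataCollator only stacks the already-padded features into tensors.
collator = DefaultDataCollator(return_tensors="tf")
tf_train = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
    drop_remainder=True,  # keep the batch dimension static as well, which TPUs prefer
)
```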
@@ -399,6 +360,11 @@ def main():
         return result

     datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
     # endregion

     with training_args.strategy.scope():
@@ -464,18 +430,14 @@ def main():
             dataset = datasets[key]
             if samples_limit is not None:
                 dataset = dataset.select(range(samples_limit))
-            if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-                logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-                dataset_mode = "constant_batch"
-            else:
-                dataset_mode = "variable_batch"
-            data = convert_dataset_for_tensorflow(
-                dataset,
-                non_label_column_names,
+            data = dataset.to_tf_dataset(
+                columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
+                shuffle=shuffle,
                 batch_size=batch_size,
-                dataset_mode=dataset_mode,
+                collate_fn=data_collator,
                 drop_remainder=drop_remainder,
-                shuffle=shuffle,
+                # `label_cols` is needed for user-defined losses, such as in this example
+                label_cols="label" if "label" in dataset.column_names else None,
             )
             tf_data[key] = data
         # endregion
......
@@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
     def train_step(self, data):
         """
-        A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss.
+        A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss.
         """
         # These are the only transformations `Model.fit` applies to user-input
         # data when a `tf.data.Dataset` is provided.
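For readers unfamiliar with `train_step` overrides: the standard Keras pattern (shown here only for context; the actual transformers implementation is collapsed in this diff and differs in its details) looks roughly like this:

```python
import tensorflow as tf

class TinyClassifier(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(2)

    def call(self, inputs):
        return self.dense(inputs)

    def train_step(self, data):
        x, y = data  # a tf.data.Dataset of (features, labels) pairs arrives essentially unchanged
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(y, y_pred)
        # Whatever is returned here is what ends up in the printed progress-bar metrics,
        # which is exactly what the overridden train_step above is cleaning up.
        return {m.name: m.result() for m in self.metrics}
```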
......