"tests/data2vec/test_modeling_data2vec_text.py" did not exist on "29c10a41d04f855c433a6cde7797b325651417d2"
Unverified commit 3956b133, authored by Joao Gante, committed by GitHub

TF text classification examples (#15704)

* Working example with to_tf_dataset

* updated text_classification

* more comments
parent 142b69f2
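The core of this commit is replacing a hand-rolled `convert_dataset_for_tensorflow` helper in the TensorFlow text classification examples with `datasets.Dataset.to_tf_dataset` plus a `DataCollator`. A minimal end-to-end sketch of that pattern (the checkpoint and dataset names are illustrative, not taken from this diff):

```python
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
raw = load_dataset("glue", "sst2", split="train[:1%]")

# Tokenize without padding; padding is deferred to the collator, so each batch is only
# padded to its own longest sequence.
tokenized = raw.map(lambda batch: tokenizer(batch["sentence"], truncation=True), batched=True)

collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
tf_train = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
)
# tf_train is a tf.data.Dataset of (features, labels) batches, ready for model.fit().
```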
@@ -65,12 +65,7 @@ Those processors are:
 Additionally, the following method can be used to load values from a data file and convert them to a list of
 [`~data.processors.utils.InputExample`].

-automethod,transformers.data.processors.glue.glue_convert_examples_to_features
-
-### Example usage
-
-An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
+[[autodoc]] data.processors.glue.glue_convert_examples_to_features

 ## XNLI
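An aside on the GLUE method referenced above: `glue_convert_examples_to_features` turns `InputExample`s into model-ready features. A hedged usage sketch, with the task and labels chosen purely for illustration:

```python
from transformers import AutoTokenizer, glue_convert_examples_to_features
from transformers.data.processors.utils import InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
examples = [
    InputExample(guid="0", text_a="A man is eating.", text_b="Someone is eating food.", label="1"),
    InputExample(guid="1", text_a="A man is eating.", text_b="The sky is blue.", label="0"),
]
# Returns a list of InputFeatures (input_ids, attention_mask, token_type_ids, label).
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")
```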
@@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcessor`]
 Additionally, the following method can be used to convert SQuAD examples into
 [`~data.processors.utils.SquadFeatures`] that can be used as model inputs.

-automethod,transformers.data.processors.squad.squad_convert_examples_to_features
+[[autodoc]] data.processors.squad.squad_convert_examples_to_features

 These processors as well as the aforementioned method can be used with files containing the data as well as with the
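Similarly, a hedged sketch of `squad_convert_examples_to_features`; the data directory is a placeholder:

```python
from transformers import AutoTokenizer, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
processor = SquadV2Processor()
examples = processor.get_dev_examples("path/to/squad")  # expects dev-v2.0.json in this directory

features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
)
```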
......
@@ -457,7 +457,8 @@ def main():
         else:
             return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

-    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
+    # we already did the padding.
     if data_args.pad_to_max_length:
         data_collator = default_data_collator
     elif training_args.fp16:
......
@@ -30,6 +30,8 @@ import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     HfArgumentParser,
     PretrainedConfig,
     TFAutoModelForSequenceClassification,
@@ -43,47 +45,6 @@ from transformers.utils import check_min_version

 # region Helper functions
-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
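For context on what the deleted helper was doing: it loaded each feature column as a `tf.RaggedTensor` and densified it per batch, so padding only ever went up to the longest sequence in that batch (unless `constant_batch` forced full-length padding for TPUs). A tiny standalone illustration of that ragged-tensor trick, not taken from the repository:

```python
import tensorflow as tf

# Two sequences of different lengths, kept ragged instead of pre-padded.
input_ids = tf.ragged.constant([[101, 7592, 102], [101, 2088, 2003, 2307, 102]])
dataset = tf.data.Dataset.from_tensor_slices({"input_ids": input_ids})

# Batch first, then densify: each batch is padded only to its own longest sequence.
batched = dataset.batch(2).map(lambda features: {k: v.to_tensor() for k, v in features.items()})
for batch in batched:
    print(batch["input_ids"].shape)  # (2, 5) -- padded to the longest sequence in this batch
```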
 class SavePretrainedCallback(tf.keras.callbacks.Callback):
     # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
     # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
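The body of that callback is collapsed in this diff. As a rough sketch (the constructor argument name is assumed and details may differ from the actual script), such a callback typically looks like:

```python
import tensorflow as tf

class SavePretrainedCallback(tf.keras.callbacks.Callback):
    def __init__(self, output_dir):
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        # save_pretrained() writes the weights plus the config needed to reload the model
        # later with from_pretrained().
        self.model.save_pretrained(self.output_dir)
```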
@@ -377,6 +338,10 @@ def main():
     datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
     # endregion

     # region Metric function
@@ -426,11 +391,6 @@ def main():
     # region Convert data to a tf.data.Dataset
     tf_data = dict()
-    if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-        logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-        dataset_mode = "constant_batch"
-    else:
-        dataset_mode = "variable_batch"
     max_samples = {
         "train": data_args.max_train_samples,
         "validation": data_args.max_eval_samples,
@@ -456,13 +416,14 @@ def main():
         dataset = datasets[key]
         if samples_limit is not None:
             dataset = dataset.select(range(samples_limit))
-        data = convert_dataset_for_tensorflow(
-            dataset,
-            non_label_column_names,
+        data = dataset.to_tf_dataset(
+            columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
+            shuffle=shuffle,
             batch_size=batch_size,
-            dataset_mode=dataset_mode,
+            collate_fn=data_collator,
             drop_remainder=drop_remainder,
-            shuffle=shuffle,
+            # `label_cols` is needed for user-defined losses, such as in this example
+            label_cols="label" if "label" in dataset.column_names else None,
         )
         tf_data[key] = data
     # endregion
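The `label_cols` comment above matters because these example scripts compute the loss in Keras rather than relying on the model's internal loss. A hedged sketch of how a dataset built this way is then consumed; `model`, `tf_data`, and `training_args` stand for the objects the script defines elsewhere, and the optimizer and loss shown are illustrative rather than copied from the diff:

```python
import tensorflow as tf

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    # label_cols="label" makes the dataset yield (features, labels) pairs, so Keras can
    # apply this user-defined loss to the model's output logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(tf_data["train"], validation_data=tf_data.get("validation"), epochs=int(training_args.num_train_epochs))
```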
......
@@ -29,6 +29,8 @@ from datasets import load_dataset
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     HfArgumentParser,
     PretrainedConfig,
     TFAutoModelForSequenceClassification,
@@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
         self.model.save_pretrained(self.output_dir)

-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
-

 # endregion
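This is the same helper being deleted from the second example script. With it gone, the fixed-shape ("constant_batch") behaviour is obtained by padding at tokenization time and stacking with `DefaultDataCollator`, mirroring the dynamic-padding sketch earlier; the model and dataset names below are illustrative, not taken from the diff:

```python
from datasets import load_dataset
from transformers import AutoTokenizer, DefaultDataCollator

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
raw = load_dataset("glue", "sst2", split="train[:1%]")
tokenized = raw.map(
    lambda batch: tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=128),
    batched=True,
)

# DefaultDataCollator only stacks the already-padded features into tensors.
collator = DefaultDataCollator(return_tensors="tf")
tf_train = tokenized.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
    drop_remainder=True,  # keep the batch dimension static as well, which TPUs prefer
)
```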
@@ -399,6 +360,11 @@ def main():
         return result

     datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
     # endregion

     with training_args.strategy.scope():
@@ -464,18 +430,14 @@ def main():
             dataset = datasets[key]
             if samples_limit is not None:
                 dataset = dataset.select(range(samples_limit))
-            if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-                logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-                dataset_mode = "constant_batch"
-            else:
-                dataset_mode = "variable_batch"
-            data = convert_dataset_for_tensorflow(
-                dataset,
-                non_label_column_names,
+            data = dataset.to_tf_dataset(
+                columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
+                shuffle=shuffle,
                 batch_size=batch_size,
-                dataset_mode=dataset_mode,
+                collate_fn=data_collator,
                 drop_remainder=drop_remainder,
-                shuffle=shuffle,
+                # `label_cols` is needed for user-defined losses, such as in this example
+                label_cols="label" if "label" in dataset.column_names else None,
             )
             tf_data[key] = data
         # endregion
......
@@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
     def train_step(self, data):
         """
-        A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss.
+        A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss.
         """
         # These are the only transformations `Model.fit` applies to user-input
         # data when a `tf.data.Dataset` is provided.
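For readers unfamiliar with `train_step` overrides: the standard Keras pattern (shown here only for context; the actual transformers implementation is collapsed in this diff and differs in its details) looks roughly like this:

```python
import tensorflow as tf

class TinyClassifier(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(2)

    def call(self, inputs):
        return self.dense(inputs)

    def train_step(self, data):
        x, y = data  # a tf.data.Dataset of (features, labels) pairs arrives essentially unchanged
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(y, y_pred)
        # Whatever is returned here is what ends up in the printed progress-bar metrics,
        # which is exactly what the overridden train_step above is cleaning up.
        return {m.name: m.result() for m in self.metrics}
```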
......