Merge remote-tracking branch 'origin/master'

b3f95dce · Sylvain Gugger · a492aec8 · a3bd7637 · b3f95dce · b3f95dce
Commit b3f95dce authored Jul 26, 2021 by Sylvain Gugger
6 changed files
--- a/examples/tensorflow/multiple-choice/README.md
+++ b/examples/tensorflow/multiple-choice/README.md
 <!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
+Copyright 2021 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,26 +13,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
+# Multiple-choice training (e.g. SWAG)

-# Multiple Choice
+This folder contains the `run_swag.py` script, showing an examples of *multiple-choice answering* with the 
+🤗 Transformers library. For straightforward use-cases you may be able to use these scripts without modification, 
+although we have also included comments in the code to indicate areas that you may need to adapt to your own projects.

-## Fine-tuning on SWAG
+### Multi-GPU and TPU usage

+By default, the script uses a `MirroredStrategy` and will use multiple GPUs effectively if they are available. TPUs
+can also be used by passing the name of the TPU resource with the `--tpu` argument.
+
+### Memory usage and data loading
+
+One thing to note is that all data is loaded into memory in this script. Most multiple-choice datasets are small
+enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle
+data streaming. This is particularly challenging for TPUs, given the stricter requirements and the sheer volume of data
+required to keep them fed. A full explanation of all the possible pitfalls is a bit beyond this example script and 
+README, but for more information you can see the 'Input Datasets' section of 
+[this document](https://www.tensorflow.org/guide/tpu).
+
+### Example command
 ```bash
-export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/multiple-choice/run_tf_multiple_choice.py \
--task_name swag \
--model_name_or_path bert-base-cased \
--do_train \
--do_eval \
--data_dir $SWAG_DIR \
--learning_rate 5e-5 \
--num_train_epochs 3 \
--max_seq_length 80 \
--output_dir models_bert/swag_base \
--per_gpu_eval_batch_size=16 \
--per_device_train_batch_size=16 \
--logging-dir logs \
--gradient_accumulation_steps 2 \
--overwrite_output
+python run_swag.py \
+ --model_name_or_path distilbert-base-cased \
+ --output_dir output \
+ --do_eval \
+ --do_train
 ```
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for multiple choice.
+"""
+# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+import datasets
+import numpy as np
+import tensorflow as tf
+from datasets import load_dataset
+
+import transformers
+from transformers import (
+    CONFIG_NAME,
+    TF2_WEIGHTS_NAME,
+    AutoConfig,
+    AutoTokenizer,
+    HfArgumentParser,
+    TFAutoModelForMultipleChoice,
+    TFTrainingArguments,
+    create_optimizer,
+    set_seed,
+)
+from transformers.utils import check_min_version
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.10.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+# region Helper classes and functions
+class SavePretrainedCallback(tf.keras.callbacks.Callback):
+    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
+    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
+    # that saves the model with this method after each epoch.
+    def __init__(self, output_dir, **kwargs):
+        super().__init__()
+        self.output_dir = output_dir
+
+    def on_epoch_end(self, epoch, logs=None):
+        self.model.save_pretrained(self.output_dir)
+
+
+def convert_dataset_for_tensorflow(
+    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
+):
+    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
+    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
+    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
+    """
+
+    def densify_ragged_batch(features, label=None):
+        features = {
+            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
+        }
+        if label is None:
+            return features
+        else:
+            return features, label
+
+    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
+    if dataset_mode == "variable_batch":
+        batch_shape = {key: None for key in feature_keys}
+        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
+    elif dataset_mode == "constant_batch":
+        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
+        batch_shape = {
+            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
+            for key, ragged_tensor in data.items()
+        }
+    else:
+        raise ValueError("Unknown dataset mode!")
+
+    if "label" in dataset.features:
+        labels = tf.convert_to_tensor(np.array(dataset["label"]))
+        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+    else:
+        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
+    if shuffle:
+        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
+    options = tf.data.Options()
+    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
+    tf_dataset = (
+        tf_dataset.with_options(options)
+        .batch(batch_size=batch_size, drop_remainder=drop_remainder)
+        .map(densify_ragged_batch)
+    )
+    return tf_dataset
+
+
+# endregion
+
+# region Arguments
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. If passed, sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to the maximum sentence length. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
+            "efficient on GPU but very bad for TPU."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+
+    def __post_init__(self):
+        if self.train_file is not None:
+            extension = self.train_file.split(".")[-1]
+            assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+        if self.validation_file is not None:
+            extension = self.validation_file.split(".")[-1]
+            assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+
+
+# endregion
+
+
+def main():
+    # region Argument parsing
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    output_dir = Path(training_args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # endregion
+
+    # region Logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+    # endregion
+
+    # region Checkpoints
+    checkpoint = None
+    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
+        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
+            checkpoint = output_dir
+            logger.info(
+                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
+                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+        else:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to continue regardless."
+            )
+    # endregion
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # region Load datasets
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.train_file is not None or data_args.validation_file is not None:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+    else:
+        # Downloading and loading the swag dataset from the hub.
+        raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # When using your own dataset or a different dataset from swag, you will probably need to change this.
+    ending_names = [f"ending{i}" for i in range(4)]
+    context_name = "sent1"
+    question_header_name = "sent2"
+    # endregion
+
+    # region Load model config and tokenizer
+    if checkpoint is not None:
+        config_path = training_args.output_dir
+    elif model_args.config_name:
+        config_path = model_args.config_name
+    else:
+        config_path = model_args.model_name_or_path
+
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    config = AutoConfig.from_pretrained(
+        config_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        use_fast=model_args.use_fast_tokenizer,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # endregion
+
+    # region Dataset preprocessing
+    if data_args.max_seq_length is None:
+        max_seq_length = tokenizer.model_max_length
+        if max_seq_length > 1024:
+            logger.warning(
+                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
+            )
+            max_seq_length = 1024
+    else:
+        if data_args.max_seq_length > tokenizer.model_max_length:
+            logger.warning(
+                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+            )
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    def preprocess_function(examples):
+        first_sentences = [[context] * 4 for context in examples[context_name]]
+        question_headers = examples[question_header_name]
+        second_sentences = [
+            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        ]
+
+        # Flatten out
+        first_sentences = sum(first_sentences, [])
+        second_sentences = sum(second_sentences, [])
+
+        # Tokenize
+        tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
+        # Un-flatten
+        data = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
+        return data
+
+    if training_args.do_train:
+        if "train" not in raw_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = raw_datasets["train"]
+        non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
+        if data_args.max_train_samples is not None:
+            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
+
+    if training_args.do_eval:
+        if "validation" not in raw_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_dataset = raw_datasets["validation"]
+        if not training_args.do_train:
+            non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
+    # endregion
+
+    with training_args.strategy.scope():
+        # region Build model
+        if checkpoint is None:
+            model_path = model_args.model_name_or_path
+        else:
+            model_path = checkpoint
+        model = TFAutoModelForMultipleChoice.from_pretrained(
+            model_path,
+            config=config,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+
+        num_replicas = training_args.strategy.num_replicas_in_sync
+        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
+        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
+        if training_args.do_train:
+            total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
+            optimizer, lr_schedule = create_optimizer(
+                init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0
+            )
+        else:
+            optimizer = "adam"  # Just put anything in here, since we're not using it anyway
+        model.compile(
+            optimizer=optimizer,
+            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+            metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
+        )
+        # endregion
+
+        # region Training
+        if training_args.do_train:
+            tf_train_dataset = convert_dataset_for_tensorflow(
+                train_dataset, non_label_column_names=non_label_columns, batch_size=total_train_batch_size
+            )
+            if training_args.do_eval:
+                validation_data = convert_dataset_for_tensorflow(
+                    eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size
+                )
+            else:
+                validation_data = None
+            model.fit(
+                tf_train_dataset,
+                validation_data=validation_data,
+                epochs=int(training_args.num_train_epochs),
+                callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
+            )
+        # endregion
+
+        # region Evaluation
+        if training_args.do_eval and not training_args.do_train:
+            # Do a standalone evaluation pass
+            tf_eval_dataset = convert_dataset_for_tensorflow(
+                eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size
+            )
+            model.evaluate(tf_eval_dataset)
+        # endregion
+
+        # region Push to hub
+        if training_args.push_to_hub:
+            model.push_to_hub(
+                finetuned_from=model_args.model_name_or_path,
+                tasks="multiple-choice",
+                dataset_tags="swag",
+                dataset_args="regular",
+                dataset="SWAG",
+                language="en",
+            )
+        # endregion
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/tensorflow/multiple-choice/run_tf_multiple_choice.py
+++ b/examples/tensorflow/multiple-choice/run_tf_multiple_choice.py
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
-
-
-import logging
-import os
-from dataclasses import dataclass, field
-from typing import Dict, Optional
-
-import numpy as np
-
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    EvalPrediction,
-    HfArgumentParser,
-    TFAutoModelForMultipleChoice,
-    TFTrainer,
-    TFTrainingArguments,
-    set_seed,
-)
-from transformers.utils import logging as hf_logging
-from utils_multiple_choice import Split, TFMultipleChoiceDataset, processors
-
-
-hf_logging.set_verbosity_info()
-hf_logging.enable_default_handler()
-hf_logging.enable_explicit_format()
-
-
-logger = logging.getLogger(__name__)
-
-
-def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
-    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
-    max_seq_length: int = field(
-        default=128,
-        metadata={
-            "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    if (
-        os.path.exists(training_args.output_dir)
-        and os.listdir(training_args.output_dir)
-        and training_args.do_train
-        and not training_args.overwrite_output_dir
-    ):
-        raise ValueError(
-            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
-        )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.warning(
-        f"device: {training_args.device}, n_replicas: {training_args.n_replicas}, "
-        f"16-bits training: {training_args.fp16}"
-    )
-    logger.info(f"Training/evaluation parameters {training_args}")
-
-    # Set seed
-    set_seed(training_args.seed)
-
-    try:
-        processor = processors[data_args.task_name]()
-        label_list = processor.get_labels()
-        num_labels = len(label_list)
-    except KeyError:
-        raise ValueError(f"Task not found: {data_args.task_name}")
-
-    # Load pretrained model and tokenizer
-    #
-    # Distributed training:
-    # The .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-    config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=data_args.task_name,
-        cache_dir=model_args.cache_dir,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-    )
-    with training_args.strategy.scope():
-        model = TFAutoModelForMultipleChoice.from_pretrained(
-            model_args.model_name_or_path,
-            from_pt=bool(".bin" in model_args.model_name_or_path),
-            config=config,
-            cache_dir=model_args.cache_dir,
-        )
-    # Get datasets
-    train_dataset = (
-        TFMultipleChoiceDataset(
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            task=data_args.task_name,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.train,
-        )
-        if training_args.do_train
-        else None
-    )
-    eval_dataset = (
-        TFMultipleChoiceDataset(
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            task=data_args.task_name,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.dev,
-        )
-        if training_args.do_eval
-        else None
-    )
-
-    def compute_metrics(p: EvalPrediction) -> Dict:
-        preds = np.argmax(p.predictions, axis=1)
-        return {"acc": simple_accuracy(preds, p.label_ids)}
-
-    # Initialize our Trainer
-    trainer = TFTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset.get_dataset() if train_dataset else None,
-        eval_dataset=eval_dataset.get_dataset() if eval_dataset else None,
-        compute_metrics=compute_metrics,
-    )
-
-    # Training
-    if training_args.do_train:
-        trainer.train()
-        trainer.save_model()
-        tokenizer.save_pretrained(training_args.output_dir)
-    # Evaluation
-    results = {}
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-
-        result = trainer.evaluate()
-
-        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key, value in result.items():
-                logger.info(f"  {key} = {value}")
-                writer.write(f"{key} = {value}\n")
-
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/tensorflow/multiple-choice/utils_multiple_choice.py
+++ b/examples/tensorflow/multiple-choice/utils_multiple_choice.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
-
-
-import csv
-import glob
-import json
-import logging
-import os
-from dataclasses import dataclass
-from enum import Enum
-from typing import List, Optional
-
-import tqdm
-
-from filelock import FileLock
-from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass(frozen=True)
-class InputExample:
-    """
-    A single training/test example for multiple choice
-
-    Args:
-        example_id: Unique id for the example.
-        question: string. The untokenized text of the second sequence (question).
-        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
-        label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
-    """
-
-    example_id: str
-    question: str
-    contexts: List[str]
-    endings: List[str]
-    label: Optional[str]
-
-
-@dataclass(frozen=True)
-class InputFeatures:
-    """
-    A single set of features of data.
-    Property names are the same names as the corresponding inputs to a model.
-    """
-
-    example_id: str
-    input_ids: List[List[int]]
-    attention_mask: Optional[List[List[int]]]
-    token_type_ids: Optional[List[List[int]]]
-    label: Optional[int]
-
-
-class Split(Enum):
-    train = "train"
-    dev = "dev"
-    test = "test"
-
-
-if is_torch_available():
-    import torch
-    from torch.utils.data.dataset import Dataset
-
-    class MultipleChoiceDataset(Dataset):
-        """
-        This will be superseded by a framework-agnostic approach
-        soon.
-        """
-
-        features: List[InputFeatures]
-
-        def __init__(
-            self,
-            data_dir: str,
-            tokenizer: PreTrainedTokenizer,
-            task: str,
-            max_seq_length: Optional[int] = None,
-            overwrite_cache=False,
-            mode: Split = Split.train,
-        ):
-            processor = processors[task]()
-
-            cached_features_file = os.path.join(
-                data_dir, f"cached_{mode.value}_{tokenizer.__class__.__name__}_{max_seq_length}_{task}"
-            )
-
-            # Make sure only the first process in distributed training processes the dataset,
-            # and the others will use the cache.
-            lock_path = cached_features_file + ".lock"
-            with FileLock(lock_path):
-
-                if os.path.exists(cached_features_file) and not overwrite_cache:
-                    logger.info(f"Loading features from cached file {cached_features_file}")
-                    self.features = torch.load(cached_features_file)
-                else:
-                    logger.info(f"Creating features from dataset file at {data_dir}")
-                    label_list = processor.get_labels()
-                    if mode == Split.dev:
-                        examples = processor.get_dev_examples(data_dir)
-                    elif mode == Split.test:
-                        examples = processor.get_test_examples(data_dir)
-                    else:
-                        examples = processor.get_train_examples(data_dir)
-                    logger.info(f"Training examples: {len(examples)}")
-                    self.features = convert_examples_to_features(
-                        examples,
-                        label_list,
-                        max_seq_length,
-                        tokenizer,
-                    )
-                    logger.info(f"Saving features into cached file {cached_features_file}")
-                    torch.save(self.features, cached_features_file)
-
-        def __len__(self):
-            return len(self.features)
-
-        def __getitem__(self, i) -> InputFeatures:
-            return self.features[i]
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    class TFMultipleChoiceDataset:
-        """
-        This will be superseded by a framework-agnostic approach
-        soon.
-        """
-
-        features: List[InputFeatures]
-
-        def __init__(
-            self,
-            data_dir: str,
-            tokenizer: PreTrainedTokenizer,
-            task: str,
-            max_seq_length: Optional[int] = 128,
-            overwrite_cache=False,
-            mode: Split = Split.train,
-        ):
-            processor = processors[task]()
-
-            logger.info(f"Creating features from dataset file at {data_dir}")
-            label_list = processor.get_labels()
-            if mode == Split.dev:
-                examples = processor.get_dev_examples(data_dir)
-            elif mode == Split.test:
-                examples = processor.get_test_examples(data_dir)
-            else:
-                examples = processor.get_train_examples(data_dir)
-            logger.info(f"Training examples: {len(examples)}")
-
-            self.features = convert_examples_to_features(
-                examples,
-                label_list,
-                max_seq_length,
-                tokenizer,
-            )
-
-            def gen():
-                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
-                    if ex_index % 10000 == 0:
-                        logger.info(f"Writing example {ex_index} of {len(examples)}")
-
-                    yield (
-                        {
-                            "example_id": 0,
-                            "input_ids": ex.input_ids,
-                            "attention_mask": ex.attention_mask,
-                            "token_type_ids": ex.token_type_ids,
-                        },
-                        ex.label,
-                    )
-
-            self.dataset = tf.data.Dataset.from_generator(
-                gen,
-                (
-                    {
-                        "example_id": tf.int32,
-                        "input_ids": tf.int32,
-                        "attention_mask": tf.int32,
-                        "token_type_ids": tf.int32,
-                    },
-                    tf.int64,
-                ),
-                (
-                    {
-                        "example_id": tf.TensorShape([]),
-                        "input_ids": tf.TensorShape([None, None]),
-                        "attention_mask": tf.TensorShape([None, None]),
-                        "token_type_ids": tf.TensorShape([None, None]),
-                    },
-                    tf.TensorShape([]),
-                ),
-            )
-
-        def get_dataset(self):
-            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
-
-            return self.dataset
-
-        def __len__(self):
-            return len(self.features)
-
-        def __getitem__(self, i) -> InputFeatures:
-            return self.features[i]
-
-
-class DataProcessor:
-    """Base class for data converters for multiple choice data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_test_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the test set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-
-class RaceProcessor(DataProcessor):
-    """Processor for the RACE data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        high = os.path.join(data_dir, "train/high")
-        middle = os.path.join(data_dir, "train/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        high = os.path.join(data_dir, "dev/high")
-        middle = os.path.join(data_dir, "dev/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} test")
-        high = os.path.join(data_dir, "test/high")
-        middle = os.path.join(data_dir, "test/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_txt(self, input_dir):
-        lines = []
-        files = glob.glob(input_dir + "/*txt")
-        for file in tqdm.tqdm(files, desc="read files"):
-            with open(file, "r", encoding="utf-8") as fin:
-                data_raw = json.load(fin)
-                data_raw["race_id"] = file
-                lines.append(data_raw)
-        return lines
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (_, data_raw) in enumerate(lines):
-            race_id = f"{set_type}-{data_raw['race_id']}"
-            article = data_raw["article"]
-            for i in range(len(data_raw["answers"])):
-                truth = str(ord(data_raw["answers"][i]) - ord("A"))
-                question = data_raw["questions"][i]
-                options = data_raw["options"][i]
-
-                examples.append(
-                    InputExample(
-                        example_id=race_id,
-                        question=question,
-                        contexts=[article, article, article, article],  # this is not efficient but convenient
-                        endings=[options[0], options[1], options[2], options[3]],
-                        label=truth,
-                    )
-                )
-        return examples
-
-
-class SynonymProcessor(DataProcessor):
-    """Processor for the Synonym data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3", "4"]
-
-    def _read_csv(self, input_file):
-        with open(input_file, "r", encoding="utf-8") as f:
-            return list(csv.reader(f))
-
-    def _create_examples(self, lines: List[List[str]], type: str):
-        """Creates examples for the training and dev sets."""
-
-        examples = [
-            InputExample(
-                example_id=line[0],
-                question="",  # in the swag dataset, the
-                # common beginning of each
-                # choice is stored in "sent2".
-                contexts=[line[1], line[1], line[1], line[1], line[1]],
-                endings=[line[2], line[3], line[4], line[5], line[6]],
-                label=line[7],
-            )
-            for line in lines  # we skip the line with the column names
-        ]
-
-        return examples
-
-
-class SwagProcessor(DataProcessor):
-    """Processor for the SWAG data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        raise ValueError(
-            "For swag testing, the input file does not contain a label column. It can not be tested in current code"
-            "setting!"
-        )
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_csv(self, input_file):
-        with open(input_file, "r", encoding="utf-8") as f:
-            return list(csv.reader(f))
-
-    def _create_examples(self, lines: List[List[str]], type: str):
-        """Creates examples for the training and dev sets."""
-        if type == "train" and lines[0][-1] != "label":
-            raise ValueError("For training, the input file must contain a label column.")
-
-        examples = [
-            InputExample(
-                example_id=line[2],
-                question=line[5],  # in the swag dataset, the
-                # common beginning of each
-                # choice is stored in "sent2".
-                contexts=[line[4], line[4], line[4], line[4]],
-                endings=[line[7], line[8], line[9], line[10]],
-                label=line[11],
-            )
-            for line in lines[1:]  # we skip the line with the column names
-        ]
-
-        return examples
-
-
-class ArcProcessor(DataProcessor):
-    """Processor for the ARC data set (request from allennlp)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
-
-    def get_test_examples(self, data_dir):
-        logger.info(f"LOOKING AT {data_dir} test")
-        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_json(self, input_file):
-        with open(input_file, "r", encoding="utf-8") as fin:
-            lines = fin.readlines()
-            return lines
-
-    def _create_examples(self, lines, type):
-        """Creates examples for the training and dev sets."""
-
-        # There are two types of labels. They should be normalized
-        def normalize(truth):
-            if truth in "ABCD":
-                return ord(truth) - ord("A")
-            elif truth in "1234":
-                return int(truth) - 1
-            else:
-                logger.info(f"truth ERROR! {truth}")
-                return None
-
-        examples = []
-        three_choice = 0
-        four_choice = 0
-        five_choice = 0
-        other_choices = 0
-        # we deleted example which has more than or less than four choices
-        for line in tqdm.tqdm(lines, desc="read arc data"):
-            data_raw = json.loads(line.strip("\n"))
-            if len(data_raw["question"]["choices"]) == 3:
-                three_choice += 1
-                continue
-            elif len(data_raw["question"]["choices"]) == 5:
-                five_choice += 1
-                continue
-            elif len(data_raw["question"]["choices"]) != 4:
-                other_choices += 1
-                continue
-            four_choice += 1
-            truth = str(normalize(data_raw["answerKey"]))
-            assert truth != "None"
-            question_choices = data_raw["question"]
-            question = question_choices["stem"]
-            id = data_raw["id"]
-            options = question_choices["choices"]
-            if len(options) == 4:
-                examples.append(
-                    InputExample(
-                        example_id=id,
-                        question=question,
-                        contexts=[
-                            options[0]["para"].replace("_", ""),
-                            options[1]["para"].replace("_", ""),
-                            options[2]["para"].replace("_", ""),
-                            options[3]["para"].replace("_", ""),
-                        ],
-                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth,
-                    )
-                )
-
-        if type == "train":
-            assert len(examples) > 1
-            assert examples[0].label is not None
-        logger.info(f"len examples: {len(examples)}")
-        logger.info(f"Three choices: {three_choice}")
-        logger.info(f"Five choices: {five_choice}")
-        logger.info(f"Other choices: {other_choices}")
-        logger.info(f"four choices: {four_choice}")
-
-        return examples
-
-
-def convert_examples_to_features(
-    examples: List[InputExample],
-    label_list: List[str],
-    max_length: int,
-    tokenizer: PreTrainedTokenizer,
-) -> List[InputFeatures]:
-    """
-    Loads a data file into a list of `InputFeatures`
-    """
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
-        if ex_index % 10000 == 0:
-            logger.info(f"Writing example {ex_index} of {len(examples)}")
-        choices_inputs = []
-        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
-            text_a = context
-            if example.question.find("_") != -1:
-                # this is for cloze question
-                text_b = example.question.replace("_", ending)
-            else:
-                text_b = example.question + " " + ending
-
-            inputs = tokenizer(
-                text_a,
-                text_b,
-                add_special_tokens=True,
-                max_length=max_length,
-                padding="max_length",
-                truncation=True,
-                return_overflowing_tokens=True,
-            )
-            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
-                logger.info(
-                    "Attention! you are cropping tokens (swag task is ok). "
-                    "If you are training ARC and RACE and you are poping question + options,"
-                    "you need to try to use a bigger max seq length!"
-                )
-
-            choices_inputs.append(inputs)
-
-        label = label_map[example.label]
-
-        input_ids = [x["input_ids"] for x in choices_inputs]
-        attention_mask = (
-            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
-        )
-        token_type_ids = (
-            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
-        )
-
-        features.append(
-            InputFeatures(
-                example_id=example.example_id,
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                token_type_ids=token_type_ids,
-                label=label,
-            )
-        )
-
-    for f in features[:2]:
-        logger.info("*** Example ***")
-        logger.info("feature: {f}")
-
-    return features
-
-
-processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5}
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -270,7 +270,19 @@ class TokenClassificationPipeline(Pipeline):
            if offset_mapping is not None:
                start_ind, end_ind = offset_mapping[idx]
                word_ref = sentence[start_ind:end_ind]
-                is_subword = len(word_ref) != len(word)
+                if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None):
+                    # This is a BPE, word aware tokenizer, there is a correct way
+                    # to fuse tokens
+                    is_subword = len(word) != len(word_ref)
+                else:
+                    # This is a fallback heuristic. This will fail most likely on any kind of text + punctuation mixtures that will be considered "words". Non word aware models cannot do better than this unfortunately.
+                    if self.aggregation_strategy in {
+                        AggregationStrategy.FIRST,
+                        AggregationStrategy.AVERAGE,
+                        AggregationStrategy.MAX,
+                    }:
+                        warnings.warn(UserWarning, "Tokenizer does not support real words, using fallback heuristic")
+                    is_subword = sentence[start_ind - 1 : start_ind] != " " if start_ind > 0 else False

                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                    word = word_ref

--- a/tests/test_pipelines_token_classification.py
+++ b/tests/test_pipelines_token_classification.py
@@ -191,6 +191,19 @@ class TokenClassificationPipelineTests(CustomInputPipelineCommonMixin, unittest.
            ],
        )

+    @require_torch
+    @slow
+    def test_aggregation_strategy_byte_level_tokenizer(self):
+        sentence = "Groenlinks praat over Schiphol."
+        ner = pipeline("ner", model="xlm-roberta-large-finetuned-conll02-dutch", aggregation_strategy="max")
+        self.assertEqual(
+            nested_simplify(ner(sentence)),
+            [
+                {"end": 10, "entity_group": "ORG", "score": 0.994, "start": 0, "word": "Groenlinks"},
+                {"entity_group": "LOC", "score": 1.0, "word": "Schiphol.", "start": 22, "end": 31},
+            ],
+        )
+
    @require_torch
    def test_aggregation_strategy(self):
        model_name = self.small_models[0]