Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI"
  This reverts commit b7672826.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples"
  This reverts commit 833fc17a.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg

Pass datasets trust_remote_code (#31406)
* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI"
  This reverts commit b7672826.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples"
  This reverts commit 833fc17a.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg
a14b055b · Albert Villanova del Moral · GitHub · 485fd814 · a14b055b · a14b055b
Unverified commit a14b055b, authored Jun 17, 2024 by Albert Villanova del Moral; committed by GitHub on Jun 17, 2024.
20 changed files
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@@ -245,9 +245,9 @@ class DataTrainingArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -434,6 +434,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
        )
        if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,6 +460,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
        )
        if data_args.max_eval_samples is not None:

--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -98,9 +98,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -347,6 +347,7 @@ def main():
            split=data_args.train_split_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    if training_args.do_eval:
@@ -356,6 +357,7 @@ def main():
            split=data_args.eval_split_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:

--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -112,9 +112,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -397,6 +397,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -268,12 +268,11 @@ def parse_args():
    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--trust_remote_code",
-        type=bool,
+        action="store_true",
-        default=False,
        help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
-            "execute code present on the Hub on your local machine."
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -398,7 +397,9 @@ def main():
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
    else:
        data_files = {}
        if args.train_file is not None:

--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -313,6 +313,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
            {self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
            --model_name_or_path google/vit-base-patch16-224-in21k
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --learning_rate 1e-4
            --per_device_train_batch_size 2
            --per_device_eval_batch_size 1

--- a/examples/pytorch/test_pytorch_examples.py
+++ b/examples/pytorch/test_pytorch_examples.py
@@ -391,6 +391,7 @@ class ExamplesTests(TestCasePlus):
            --output_dir {tmp_dir}
            --model_name_or_path google/vit-base-patch16-224-in21k
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -424,6 +425,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -454,6 +456,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -486,6 +489,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -513,6 +517,7 @@ class ExamplesTests(TestCasePlus):
            --output_dir {tmp_dir}
            --model_name_or_path hf-internal-testing/tiny-random-wav2vec2
            --dataset_name anton-l/superb_demo
+            --trust_remote_code
            --dataset_config_name ks
            --train_split_name test
            --eval_split_name test
@@ -547,6 +552,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_name hf-internal-testing/librispeech_asr_dummy
            --dataset_config_names clean
            --dataset_split_names validation
+            --trust_remote_code
            --learning_rate 1e-4
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4
@@ -567,6 +573,7 @@ class ExamplesTests(TestCasePlus):
            run_mae.py
            --output_dir {tmp_dir}
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -240,9 +240,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -338,6 +338,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
        # Try print some info about the dataset
        logger.info(f"Dataset loaded: {raw_datasets}")

--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -201,9 +201,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -300,6 +300,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        # Loading a dataset from your local files.

--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -92,9 +92,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -290,6 +290,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -212,12 +212,11 @@ def parse_args():
    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--trust_remote_code",
-        type=bool,
+        action="store_true",
-        default=False,
        help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
-            "execute code present on the Hub on your local machine."
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -333,7 +332,9 @@ def main():
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
    else:
        data_files = {}
        if args.train_file is not None:

--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -102,9 +102,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -346,6 +346,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -76,7 +76,6 @@ def parse_args():
        default=None,
        help="The name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--predict_with_generate",
        type=bool,
@@ -259,12 +258,11 @@ def parse_args():
    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--trust_remote_code",
-        type=bool,
+        action="store_true",
-        default=False,
        help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
-            "execute code present on the Hub on your local machine."
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -378,7 +376,9 @@ def main():
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
    else:
        data_files = {}
        if args.train_file is not None:

--- a/examples/tensorflow/_tests_requirements.txt
+++ b/examples/tensorflow/_tests_requirements.txt
@@ -14,7 +14,7 @@ streamlit
 elasticsearch
 nltk
 pandas
-datasets >= 1.13.3,<2.20.0 # Temporary upper version
+datasets >= 1.13.3
 fire
 pytest<8.0.1
 conllu

--- a/examples/tensorflow/contrastive-image-text/run_clip.py
+++ b/examples/tensorflow/contrastive-image-text/run_clip.py
@@ -105,9 +105,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -326,6 +326,7 @@ def main():
            keep_in_memory=False,
            data_dir=data_args.data_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/tensorflow/image-classification/run_image_classification.py
+++ b/examples/tensorflow/image-classification/run_image_classification.py
@@ -171,9 +171,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -284,6 +284,7 @@ def main():
            cache_dir=model_args.cache_dir,
            task="image-classification",
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py
+++ b/examples/tensorflow/language-modeling-tpu/prepare_tfrecord_shards.py
@@ -42,6 +42,15 @@ def parse_args():
    parser.add_argument(
        "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset."
    )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
    parser.add_argument(
        "--tokenizer_name_or_path",
        type=str,
@@ -105,7 +114,9 @@ def get_serialized_examples(tokenized_data):
 def main(args):
-    dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split=args.split)
+    dataset = datasets.load_dataset(
+        args.dataset_name, args.dataset_config, split=args.split, trust_remote_code=args.trust_remote_code
+    )
    if args.limit is not None:
        max_samples = min(len(dataset), args.limit)

--- a/examples/tensorflow/language-modeling-tpu/train_unigram.py
+++ b/examples/tensorflow/language-modeling-tpu/train_unigram.py
@@ -41,6 +41,15 @@ def parse_args():
    parser.add_argument(
        "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset."
    )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
    parser.add_argument(
        "--batch_size",
        type=int,
@@ -69,7 +78,9 @@ def parse_args():
 def main(args):
-    dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split="train")
+    dataset = datasets.load_dataset(
+        args.dataset_name, args.dataset_config, split="train", trust_remote_code=args.trust_remote_code
+    )
    if args.limit is not None:
        max_train_samples = min(len(dataset), args.limit)

--- a/examples/tensorflow/language-modeling/run_clm.py
+++ b/examples/tensorflow/language-modeling/run_clm.py
@@ -125,9 +125,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -298,6 +298,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
@@ -306,6 +307,7 @@ def main():
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                token=model_args.token,
+                trust_remote_code=model_args.trust_remote_code,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
@@ -313,6 +315,7 @@ def main():
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                token=model_args.token,
+                trust_remote_code=model_args.trust_remote_code,
            )
    else:
        data_files = {}

--- a/examples/tensorflow/language-modeling/run_mlm.py
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -123,9 +123,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -307,6 +307,7 @@ def main():
            data_args.dataset_name,
            data_args.dataset_config_name,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
@@ -314,12 +315,14 @@ def main():
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                token=model_args.token,
+                trust_remote_code=model_args.trust_remote_code,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                token=model_args.token,
+                trust_remote_code=model_args.trust_remote_code,
            )
    else:
        data_files = {}

--- a/examples/tensorflow/question-answering/run_qa.py
+++ b/examples/tensorflow/question-answering/run_qa.py
@@ -104,9 +104,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -329,6 +329,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}