Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI" This reverts commit b7672826.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples" This reverts commit 833fc17a.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg

Pass datasets trust_remote_code (#31406)
* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI" This reverts commit b7672826.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples" This reverts commit 833fc17a.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg
a14b055b · Albert Villanova del Moral · GitHub · 485fd814 · a14b055b · a14b055b
Unverified Commit a14b055b authored Jun 17, 2024 by Albert Villanova del Moral Committed by GitHub Jun 17, 2024
20 changed files
--- a/examples/flax/_tests_requirements.txt
+++ b/examples/flax/_tests_requirements.txt
-datasets >= 1.13.3,<2.20.0 # Temporary upper version
+datasets >= 1.13.3
 pytest<8.0.1
 conllu
 nltk

--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -195,9 +195,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -458,6 +458,7 @@ def main():
            keep_in_memory=False,
            data_dir=data_args.data_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -191,6 +191,16 @@ class DataTrainingArguments:
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
@@ -518,6 +528,7 @@ def main():
            cache_dir=model_args.cache_dir,
            token=model_args.token,
            num_proc=data_args.preprocessing_num_workers,
+            trust_remote_code=data_args.trust_remote_code,
        )
        if "validation" not in datasets.keys():
@@ -528,6 +539,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=data_args.trust_remote_code,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
@@ -536,6 +548,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=data_args.trust_remote_code,
            )
    else:
        data_files = {}

--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -182,9 +182,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -408,6 +408,7 @@ def main():
            keep_in_memory=False,
            token=model_args.token,
            num_proc=data_args.preprocessing_num_workers,
+            trust_remote_code=model_args.trust_remote_code,
        )
        if "validation" not in dataset.keys():
@@ -418,6 +419,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=model_args.trust_remote_code,
            )
            dataset["train"] = load_dataset(
                data_args.dataset_name,
@@ -426,6 +428,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=model_args.trust_remote_code,
            )
    else:
        data_files = {}

--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -188,9 +188,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -446,6 +446,7 @@ def main():
            cache_dir=model_args.cache_dir,
            token=model_args.token,
            num_proc=data_args.preprocessing_num_workers,
+            trust_remote_code=model_args.trust_remote_code,
        )
        if "validation" not in datasets.keys():
@@ -456,6 +457,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=model_args.trust_remote_code,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
@@ -464,6 +466,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=model_args.trust_remote_code,
            )
    else:
        data_files = {}

--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -192,6 +192,16 @@ class DataTrainingArguments:
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
@@ -560,6 +570,7 @@ def main():
            cache_dir=model_args.cache_dir,
            token=model_args.token,
            num_proc=data_args.preprocessing_num_workers,
+            trust_remote_code=data_args.trust_remote_code,
        )
        if "validation" not in datasets.keys():
@@ -570,6 +581,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=data_args.trust_remote_code,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
@@ -578,6 +590,7 @@ def main():
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                num_proc=data_args.preprocessing_num_workers,
+                trust_remote_code=data_args.trust_remote_code,
            )
    else:
        data_files = {}

--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -168,9 +168,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -498,6 +498,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        # Loading the dataset from local csv or json file.

--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@@ -136,6 +136,16 @@ class DataTrainingArguments:
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
    text_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
@@ -442,6 +452,7 @@ def main():
            cache_dir=data_args.dataset_cache_dir,
            num_proc=data_args.preprocessing_num_workers,
            token=True if model_args.use_auth_token else None,
+            trust_remote_code=data_args.trust_remote_code,
        )
    if training_args.do_eval:
@@ -452,6 +463,7 @@ def main():
            cache_dir=data_args.dataset_cache_dir,
            num_proc=data_args.preprocessing_num_workers,
            token=True if model_args.use_auth_token else None,
+            trust_remote_code=data_args.trust_remote_code,
        )
    if not training_args.do_train and not training_args.do_eval:

--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -201,9 +201,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -485,6 +485,7 @@ def main():
            cache_dir=model_args.cache_dir,
            keep_in_memory=False,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/flax/test_flax_examples.py
+++ b/examples/flax/test_flax_examples.py
@@ -265,6 +265,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --num_train_epochs=2

--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -170,9 +170,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -449,6 +449,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        # Loading the dataset from local csv or json file.

--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -13,7 +13,7 @@ streamlit
 elasticsearch
 nltk
 pandas
-datasets >= 1.13.3,<2.20.0 # Temporary upper version
+datasets >= 1.13.3
 fire
 pytest<8.0.1
 conllu

--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@@ -165,9 +165,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -261,12 +261,14 @@ def main():
        data_args.dataset_config_name,
        split=data_args.train_split_name,
        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
    )
    raw_datasets["eval"] = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        split=data_args.eval_split_name,
        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
    )
    if data_args.audio_column_name not in raw_datasets["train"].column_names:

--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@@ -99,9 +99,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -305,6 +305,7 @@ def main():
            keep_in_memory=False,
            data_dir=data_args.data_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -164,9 +164,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -242,6 +242,7 @@ def main():
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )
    else:
        data_files = {}

--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -150,12 +150,11 @@ def parse_args():
    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--trust_remote_code",
-        type=bool,
+        action="store_true",
-        default=False,
        help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
-            "execute code present on the Hub on your local machine."
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -284,7 +283,7 @@ def main():
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        dataset = load_dataset(args.dataset_name)
+        dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
    else:
        data_files = {}
        if args.train_dir is not None:

--- a/examples/pytorch/image-pretraining/run_mae.py
+++ b/examples/pytorch/image-pretraining/run_mae.py
@@ -63,6 +63,16 @@ class DataTrainingArguments:
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
    image_column_name: Optional[str] = field(
        default=None, metadata={"help": "The column name of the images in the files."}
    )
@@ -225,6 +235,7 @@ def main():
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
        token=model_args.token,
+        trust_remote_code=data_args.trust_remote_code,
    )
    # If we don't have a validation split, split off a percentage of train as validation.

--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@@ -166,9 +166,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
-                "execute code present on the Hub on your local machine."
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -299,6 +299,7 @@ def main():
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
    )
    # If we don't have a validation split, split off a percentage of train as validation.

--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@@ -197,12 +197,11 @@ def parse_args():
    )
    parser.add_argument(
        "--trust_remote_code",
-        type=bool,
+        action="store_true",
-        default=False,
        help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
-            "execute code present on the Hub on your local machine."
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -441,6 +440,7 @@ def main():
        data_files=args.data_files,
        cache_dir=args.cache_dir,
        token=args.token,
+        trust_remote_code=args.trust_remote_code,
    )
    # If we don't have a validation split, split off a percentage of train as validation.

--- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py
@@ -68,6 +68,16 @@ class Arguments:
            "help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)."
        },
    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
    image_height: Optional[int] = field(default=512, metadata={"help": "Image height after resizing."})
    image_width: Optional[int] = field(default=512, metadata={"help": "Image width after resizing."})
    token: str = field(
@@ -364,7 +374,7 @@ def main():
    # Load dataset, prepare splits
    # ------------------------------------------------------------------------------------------------
-    dataset = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
    # We need to specify the label2id mapping for the model
    # it is a mapping from semantic class name to class index.