"vscode:/vscode.git/clone" did not exist on "28d5700aae1bed4ec721cbeb5bc2527079113f46"
Unverified Commit a14b055b authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
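
Across the touched examples the change follows one pattern: expose a trust_remote_code flag (via argparse or a dataclass field) and forward it to datasets.load_dataset. Below is a minimal sketch of that pattern, not code from this commit; the dataset name is illustrative. The motivation is visible in the revert bullets above: datasets 2.20 stops executing Hub loading scripts unless trust_remote_code=True is passed explicitly, so the temporary <2.20.0 pins could be dropped once every call site passed the flag.

# Minimal sketch of the pattern this commit rolls out across the examples.
# The dataset name below is illustrative, not one used in the repo.
import argparse

from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument(
    "--trust_remote_code",
    action="store_true",
    help="Allow executing dataset-loading code from the Hub on this machine.",
)
args = parser.parse_args()

# As of datasets 2.20, loading a script-based dataset without
# trust_remote_code=True raises an error instead of prompting.
dataset = load_dataset("some_user/script_dataset", trust_remote_code=args.trust_remote_code)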
@@ -71,6 +71,15 @@ def parse_args():
         help="Name of the dataset on the hub.",
         default="qubvel-hf/ade20k-mini",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--image_height",
         type=int,
@@ -425,7 +434,7 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
     # We need to specify the label2id mapping for the model
     # it is a mapping from semantic class name to class index.
...
@@ -124,9 +124,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -312,6 +312,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -321,6 +322,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -329,6 +331,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -195,12 +195,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -327,17 +326,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
...
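
Besides threading the flag through, the no_trainer hunks above replace type=bool with action="store_true", which fixes a classic argparse pitfall: bool() applied to any non-empty string is True, so --trust_remote_code False would silently enable the option. A self-contained demonstration:

# Why type=bool is a trap in argparse: bool("False") is True, since any
# non-empty string is truthy. action="store_true" avoids this entirely.
import argparse

broken = argparse.ArgumentParser()
broken.add_argument("--trust_remote_code", type=bool, default=False)
print(broken.parse_args(["--trust_remote_code", "False"]).trust_remote_code)  # True!

fixed = argparse.ArgumentParser()
fixed.add_argument("--trust_remote_code", action="store_true")
print(fixed.parse_args([]).trust_remote_code)  # False
print(fixed.parse_args(["--trust_remote_code"]).trust_remote_code)  # True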
@@ -127,9 +127,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -382,6 +382,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -391,6 +392,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -399,6 +401,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -257,12 +257,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -395,17 +394,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -121,9 +121,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -324,6 +324,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -333,6 +334,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -341,6 +343,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -202,12 +202,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -334,17 +333,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -133,6 +133,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
@@ -292,6 +302,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -300,6 +311,7 @@ def main():
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
+                trust_remote_code=data_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -307,6 +319,7 @@ def main():
                 split=f"train[{data_args.validation_split_percentage}%:]",
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
+                trust_remote_code=data_args.trust_remote_code,
             )
     else:
         data_files = {}
...
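
For the Trainer-based scripts, as in the hunk above, the flag arrives as a dataclass field rather than a handwritten argparse option; HfArgumentParser maps each dataclass field to a CLI flag automatically, and a bool field defaulting to False becomes a switch-style flag. A minimal sketch, using a stripped-down stand-in for the real DataTrainingArguments:

# Minimal sketch: HfArgumentParser turns dataclass fields into CLI flags,
# so the new field becomes a --trust_remote_code option automatically.
# This stripped-down dataclass stands in for the real DataTrainingArguments.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    trust_remote_code: bool = field(
        default=False,
        metadata={"help": "Whether to trust dataset code from the Hub."},
    )


(data_args,) = HfArgumentParser(DataTrainingArguments).parse_args_into_dataclasses(
    ["--trust_remote_code"]
)
print(data_args.trust_remote_code)  # True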
@@ -184,12 +184,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -351,7 +350,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
@@ -313,9 +313,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -383,7 +383,9 @@ def main():
     # Load dataset, prepare splits
     # ------------------------------------------------------------------------------------------------
-    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
+    dataset = load_dataset(
+        data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
+    )
     # If we don't have a validation split, split off a percentage of train as validation
     data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
...
@@ -340,12 +340,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -445,7 +444,7 @@ def main():
     # Load dataset
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
     # If we don't have a validation split, split off a percentage of train as validation.
     args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
...
@@ -93,9 +93,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -301,6 +301,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -101,6 +101,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
@@ -289,6 +299,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -100,6 +100,15 @@ def parse_args():
         default=None,
         help="The configuration name of the dataset to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
     )
@@ -356,7 +365,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
@@ -275,12 +275,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -404,7 +403,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
@@ -93,9 +93,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -346,6 +346,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -165,9 +165,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -233,7 +233,9 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
     # TODO support datasets from local folders
-    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
+    dataset = load_dataset(
+        data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
+    )
     # Rename column names to standardized names (only "image" and "label" need to be present)
     if "pixel_values" in dataset["train"].column_names:
...
@@ -180,12 +180,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -294,7 +293,7 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
     # TODO support datasets from local folders
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
     # Rename column names to standardized names (only "image" and "label" need to be present)
     if "pixel_values" in dataset["train"].column_names:
...
@@ -71,6 +71,15 @@ def parse_args():
         required=True,
         help="The names of the training data set splits to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--preprocessing_num_workers",
         type=int,
@@ -446,6 +455,7 @@ def main():
             dataset_config_name,
             split=train_split_name,
             cache_dir=args.cache_dir,
+            trust_remote_code=args.trust_remote_code,
         )
         datasets_splits.append(dataset_split)
...
@@ -255,9 +255,9 @@ class DataTrainingArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -454,6 +454,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.train_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -479,6 +480,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.eval_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
         if data_args.max_eval_samples is not None:
...