Unverified commit a14b055b, authored by Albert Villanova del Moral, committed by GitHub
Browse files

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
...@@ -245,9 +245,9 @@ class DataTrainingArguments: ...@@ -245,9 +245,9 @@ class DataTrainingArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -434,6 +434,7 @@ def main(): ...@@ -434,6 +434,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
split=data_args.train_split_name, split=data_args.train_split_name,
token=data_args.token, token=data_args.token,
trust_remote_code=data_args.trust_remote_code,
) )
if data_args.audio_column_name not in raw_datasets["train"].column_names: if data_args.audio_column_name not in raw_datasets["train"].column_names:
...@@ -459,6 +460,7 @@ def main(): ...@@ -459,6 +460,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
split=data_args.eval_split_name, split=data_args.eval_split_name,
token=data_args.token, token=data_args.token,
trust_remote_code=data_args.trust_remote_code,
) )
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
......
...@@ -98,9 +98,9 @@ class ModelArguments: ...@@ -98,9 +98,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -347,6 +347,7 @@ def main(): ...@@ -347,6 +347,7 @@ def main():
split=data_args.train_split_name, split=data_args.train_split_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
if training_args.do_eval: if training_args.do_eval:
...@@ -356,6 +357,7 @@ def main(): ...@@ -356,6 +357,7 @@ def main():
split=data_args.eval_split_name, split=data_args.eval_split_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
......
...@@ -112,9 +112,9 @@ class ModelArguments: ...@@ -112,9 +112,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -397,6 +397,7 @@ def main(): ...@@ -397,6 +397,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -268,12 +268,11 @@ def parse_args(): ...@@ -268,12 +268,11 @@ def parse_args():
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument( parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
type=bool, action="store_true",
default=False,
help=( help=(
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
), ),
) )
parser.add_argument( parser.add_argument(
...@@ -398,7 +397,9 @@ def main(): ...@@ -398,7 +397,9 @@ def main():
# download the dataset. # download the dataset.
if args.dataset_name is not None: if args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) raw_datasets = load_dataset(
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
)
else: else:
data_files = {} data_files = {}
if args.train_file is not None: if args.train_file is not None:
......
...@@ -313,6 +313,7 @@ class ExamplesTestsNoTrainer(TestCasePlus): ...@@ -313,6 +313,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
{self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py {self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
--model_name_or_path google/vit-base-patch16-224-in21k --model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample --dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--learning_rate 1e-4 --learning_rate 1e-4
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 1 --per_device_eval_batch_size 1
......
...@@ -391,6 +391,7 @@ class ExamplesTests(TestCasePlus): ...@@ -391,6 +391,7 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir} --output_dir {tmp_dir}
--model_name_or_path google/vit-base-patch16-224-in21k --model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample --dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train --do_train
--do_eval --do_eval
--learning_rate 1e-4 --learning_rate 1e-4
...@@ -424,6 +425,7 @@ class ExamplesTests(TestCasePlus): ...@@ -424,6 +425,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean --dataset_config_name clean
--train_split_name validation --train_split_name validation
--eval_split_name validation --eval_split_name validation
--trust_remote_code
--do_train --do_train
--do_eval --do_eval
--learning_rate 1e-4 --learning_rate 1e-4
...@@ -454,6 +456,7 @@ class ExamplesTests(TestCasePlus): ...@@ -454,6 +456,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean --dataset_config_name clean
--train_split_name validation --train_split_name validation
--eval_split_name validation --eval_split_name validation
--trust_remote_code
--do_train --do_train
--do_eval --do_eval
--learning_rate 1e-4 --learning_rate 1e-4
...@@ -486,6 +489,7 @@ class ExamplesTests(TestCasePlus): ...@@ -486,6 +489,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean --dataset_config_name clean
--train_split_name validation --train_split_name validation
--eval_split_name validation --eval_split_name validation
--trust_remote_code
--do_train --do_train
--do_eval --do_eval
--learning_rate 1e-4 --learning_rate 1e-4
...@@ -513,6 +517,7 @@ class ExamplesTests(TestCasePlus): ...@@ -513,6 +517,7 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir} --output_dir {tmp_dir}
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2 --model_name_or_path hf-internal-testing/tiny-random-wav2vec2
--dataset_name anton-l/superb_demo --dataset_name anton-l/superb_demo
--trust_remote_code
--dataset_config_name ks --dataset_config_name ks
--train_split_name test --train_split_name test
--eval_split_name test --eval_split_name test
...@@ -547,6 +552,7 @@ class ExamplesTests(TestCasePlus): ...@@ -547,6 +552,7 @@ class ExamplesTests(TestCasePlus):
--dataset_name hf-internal-testing/librispeech_asr_dummy --dataset_name hf-internal-testing/librispeech_asr_dummy
--dataset_config_names clean --dataset_config_names clean
--dataset_split_names validation --dataset_split_names validation
--trust_remote_code
--learning_rate 1e-4 --learning_rate 1e-4
--per_device_train_batch_size 4 --per_device_train_batch_size 4
--per_device_eval_batch_size 4 --per_device_eval_batch_size 4
...@@ -567,6 +573,7 @@ class ExamplesTests(TestCasePlus): ...@@ -567,6 +573,7 @@ class ExamplesTests(TestCasePlus):
run_mae.py run_mae.py
--output_dir {tmp_dir} --output_dir {tmp_dir}
--dataset_name hf-internal-testing/cats_vs_dogs_sample --dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train --do_train
--do_eval --do_eval
--learning_rate 1e-4 --learning_rate 1e-4
......
...@@ -240,9 +240,9 @@ class ModelArguments: ...@@ -240,9 +240,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -338,6 +338,7 @@ def main(): ...@@ -338,6 +338,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
# Try print some info about the dataset # Try print some info about the dataset
logger.info(f"Dataset loaded: {raw_datasets}") logger.info(f"Dataset loaded: {raw_datasets}")
......
...@@ -201,9 +201,9 @@ class ModelArguments: ...@@ -201,9 +201,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -300,6 +300,7 @@ def main(): ...@@ -300,6 +300,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
# Loading a dataset from your local files. # Loading a dataset from your local files.
......
...@@ -92,9 +92,9 @@ class ModelArguments: ...@@ -92,9 +92,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -290,6 +290,7 @@ def main(): ...@@ -290,6 +290,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -212,12 +212,11 @@ def parse_args(): ...@@ -212,12 +212,11 @@ def parse_args():
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument( parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
type=bool, action="store_true",
default=False,
help=( help=(
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
), ),
) )
parser.add_argument( parser.add_argument(
...@@ -333,7 +332,9 @@ def main(): ...@@ -333,7 +332,9 @@ def main():
# download the dataset. # download the dataset.
if args.dataset_name is not None: if args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) raw_datasets = load_dataset(
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
)
else: else:
data_files = {} data_files = {}
if args.train_file is not None: if args.train_file is not None:
......
...@@ -102,9 +102,9 @@ class ModelArguments: ...@@ -102,9 +102,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -346,6 +346,7 @@ def main(): ...@@ -346,6 +346,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -76,7 +76,6 @@ def parse_args(): ...@@ -76,7 +76,6 @@ def parse_args():
default=None, default=None,
help="The name of the dataset to use (via the datasets library).", help="The name of the dataset to use (via the datasets library).",
) )
parser.add_argument( parser.add_argument(
"--predict_with_generate", "--predict_with_generate",
type=bool, type=bool,
...@@ -259,12 +258,11 @@ def parse_args(): ...@@ -259,12 +258,11 @@ def parse_args():
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument( parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
type=bool, action="store_true",
default=False,
help=( help=(
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
), ),
) )
parser.add_argument( parser.add_argument(
...@@ -378,7 +376,9 @@ def main(): ...@@ -378,7 +376,9 @@ def main():
# download the dataset. # download the dataset.
if args.dataset_name is not None: if args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) raw_datasets = load_dataset(
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
)
else: else:
data_files = {} data_files = {}
if args.train_file is not None: if args.train_file is not None:
......
...@@ -14,7 +14,7 @@ streamlit ...@@ -14,7 +14,7 @@ streamlit
elasticsearch elasticsearch
nltk nltk
pandas pandas
datasets >= 1.13.3,<2.20.0 # Temporary upper version datasets >= 1.13.3
fire fire
pytest<8.0.1 pytest<8.0.1
conllu conllu
......
...@@ -105,9 +105,9 @@ class ModelArguments: ...@@ -105,9 +105,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -326,6 +326,7 @@ def main(): ...@@ -326,6 +326,7 @@ def main():
keep_in_memory=False, keep_in_memory=False,
data_dir=data_args.data_dir, data_dir=data_args.data_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -171,9 +171,9 @@ class ModelArguments: ...@@ -171,9 +171,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -284,6 +284,7 @@ def main(): ...@@ -284,6 +284,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
task="image-classification", task="image-classification",
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -42,6 +42,15 @@ def parse_args(): ...@@ -42,6 +42,15 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset." "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset."
) )
parser.add_argument(
"--trust_remote_code",
action="store_true",
help=(
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
),
)
parser.add_argument( parser.add_argument(
"--tokenizer_name_or_path", "--tokenizer_name_or_path",
type=str, type=str,
...@@ -105,7 +114,9 @@ def get_serialized_examples(tokenized_data): ...@@ -105,7 +114,9 @@ def get_serialized_examples(tokenized_data):
def main(args): def main(args):
dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split=args.split) dataset = datasets.load_dataset(
args.dataset_name, args.dataset_config, split=args.split, trust_remote_code=args.trust_remote_code
)
if args.limit is not None: if args.limit is not None:
max_samples = min(len(dataset), args.limit) max_samples = min(len(dataset), args.limit)
......
...@@ -41,6 +41,15 @@ def parse_args(): ...@@ -41,6 +41,15 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset." "--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset."
) )
parser.add_argument(
"--trust_remote_code",
action="store_true",
help=(
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
),
)
parser.add_argument( parser.add_argument(
"--batch_size", "--batch_size",
type=int, type=int,
...@@ -69,7 +78,9 @@ def parse_args(): ...@@ -69,7 +78,9 @@ def parse_args():
def main(args): def main(args):
dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split="train") dataset = datasets.load_dataset(
args.dataset_name, args.dataset_config, split="train", trust_remote_code=args.trust_remote_code
)
if args.limit is not None: if args.limit is not None:
max_train_samples = min(len(dataset), args.limit) max_train_samples = min(len(dataset), args.limit)
......
...@@ -125,9 +125,9 @@ class ModelArguments: ...@@ -125,9 +125,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -298,6 +298,7 @@ def main(): ...@@ -298,6 +298,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
if "validation" not in raw_datasets.keys(): if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset( raw_datasets["validation"] = load_dataset(
...@@ -306,6 +307,7 @@ def main(): ...@@ -306,6 +307,7 @@ def main():
split=f"train[:{data_args.validation_split_percentage}%]", split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
raw_datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
...@@ -313,6 +315,7 @@ def main(): ...@@ -313,6 +315,7 @@ def main():
split=f"train[{data_args.validation_split_percentage}%:]", split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -123,9 +123,9 @@ class ModelArguments: ...@@ -123,9 +123,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -307,6 +307,7 @@ def main(): ...@@ -307,6 +307,7 @@ def main():
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
if "validation" not in raw_datasets.keys(): if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset( raw_datasets["validation"] = load_dataset(
...@@ -314,12 +315,14 @@ def main(): ...@@ -314,12 +315,14 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]", split=f"train[:{data_args.validation_split_percentage}%]",
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
raw_datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]", split=f"train[{data_args.validation_split_percentage}%:]",
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -104,9 +104,9 @@ class ModelArguments: ...@@ -104,9 +104,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -329,6 +329,7 @@ def main(): ...@@ -329,6 +329,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment