Unverified commit a14b055b, authored by Albert Villanova del Moral and committed by GitHub
Browse files

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
datasets >= 1.13.3,<2.20.0 # Temporary upper version datasets >= 1.13.3
pytest<8.0.1 pytest<8.0.1
conllu conllu
nltk nltk
......
...@@ -195,9 +195,9 @@ class ModelArguments: ...@@ -195,9 +195,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -458,6 +458,7 @@ def main(): ...@@ -458,6 +458,7 @@ def main():
keep_in_memory=False, keep_in_memory=False,
data_dir=data_args.data_dir, data_dir=data_args.data_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -191,6 +191,16 @@ class DataTrainingArguments: ...@@ -191,6 +191,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field( dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
) )
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field( validation_file: Optional[str] = field(
default=None, default=None,
...@@ -518,6 +528,7 @@ def main(): ...@@ -518,6 +528,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
) )
if "validation" not in datasets.keys(): if "validation" not in datasets.keys():
...@@ -528,6 +539,7 @@ def main(): ...@@ -528,6 +539,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
) )
datasets["train"] = load_dataset( datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
...@@ -536,6 +548,7 @@ def main(): ...@@ -536,6 +548,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -182,9 +182,9 @@ class ModelArguments: ...@@ -182,9 +182,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -408,6 +408,7 @@ def main(): ...@@ -408,6 +408,7 @@ def main():
keep_in_memory=False, keep_in_memory=False,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
) )
if "validation" not in dataset.keys(): if "validation" not in dataset.keys():
...@@ -418,6 +419,7 @@ def main(): ...@@ -418,6 +419,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
) )
dataset["train"] = load_dataset( dataset["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
...@@ -426,6 +428,7 @@ def main(): ...@@ -426,6 +428,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -188,9 +188,9 @@ class ModelArguments: ...@@ -188,9 +188,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -446,6 +446,7 @@ def main(): ...@@ -446,6 +446,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
) )
if "validation" not in datasets.keys(): if "validation" not in datasets.keys():
...@@ -456,6 +457,7 @@ def main(): ...@@ -456,6 +457,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
) )
datasets["train"] = load_dataset( datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
...@@ -464,6 +466,7 @@ def main(): ...@@ -464,6 +466,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -192,6 +192,16 @@ class DataTrainingArguments: ...@@ -192,6 +192,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field( dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
) )
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field( validation_file: Optional[str] = field(
default=None, default=None,
...@@ -560,6 +570,7 @@ def main(): ...@@ -560,6 +570,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
) )
if "validation" not in datasets.keys(): if "validation" not in datasets.keys():
...@@ -570,6 +581,7 @@ def main(): ...@@ -570,6 +581,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
) )
datasets["train"] = load_dataset( datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
...@@ -578,6 +590,7 @@ def main(): ...@@ -578,6 +590,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -168,9 +168,9 @@ class ModelArguments: ...@@ -168,9 +168,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -498,6 +498,7 @@ def main(): ...@@ -498,6 +498,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
# Loading the dataset from local csv or json file. # Loading the dataset from local csv or json file.
......
...@@ -136,6 +136,16 @@ class DataTrainingArguments: ...@@ -136,6 +136,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field( dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
) )
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
text_column: Optional[str] = field( text_column: Optional[str] = field(
default=None, default=None,
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."}, metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
...@@ -442,6 +452,7 @@ def main(): ...@@ -442,6 +452,7 @@ def main():
cache_dir=data_args.dataset_cache_dir, cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None, token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_code,
) )
if training_args.do_eval: if training_args.do_eval:
...@@ -452,6 +463,7 @@ def main(): ...@@ -452,6 +463,7 @@ def main():
cache_dir=data_args.dataset_cache_dir, cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None, token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_code,
) )
if not training_args.do_train and not training_args.do_eval: if not training_args.do_train and not training_args.do_eval:
......
...@@ -201,9 +201,9 @@ class ModelArguments: ...@@ -201,9 +201,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -485,6 +485,7 @@ def main(): ...@@ -485,6 +485,7 @@ def main():
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
keep_in_memory=False, keep_in_memory=False,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -265,6 +265,7 @@ class ExamplesTests(TestCasePlus): ...@@ -265,6 +265,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config clean --dataset_config clean
--train_split_name validation --train_split_name validation
--eval_split_name validation --eval_split_name validation
--trust_remote_code
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir --overwrite_output_dir
--num_train_epochs=2 --num_train_epochs=2
......
...@@ -170,9 +170,9 @@ class ModelArguments: ...@@ -170,9 +170,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -449,6 +449,7 @@ def main(): ...@@ -449,6 +449,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
# Loading the dataset from local csv or json file. # Loading the dataset from local csv or json file.
......
...@@ -13,7 +13,7 @@ streamlit ...@@ -13,7 +13,7 @@ streamlit
elasticsearch elasticsearch
nltk nltk
pandas pandas
datasets >= 1.13.3,<2.20.0 # Temporary upper version datasets >= 1.13.3
fire fire
pytest<8.0.1 pytest<8.0.1
conllu conllu
......
...@@ -165,9 +165,9 @@ class ModelArguments: ...@@ -165,9 +165,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -261,12 +261,14 @@ def main(): ...@@ -261,12 +261,14 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
split=data_args.train_split_name, split=data_args.train_split_name,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
raw_datasets["eval"] = load_dataset( raw_datasets["eval"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=data_args.eval_split_name, split=data_args.eval_split_name,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
if data_args.audio_column_name not in raw_datasets["train"].column_names: if data_args.audio_column_name not in raw_datasets["train"].column_names:
......
...@@ -99,9 +99,9 @@ class ModelArguments: ...@@ -99,9 +99,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -305,6 +305,7 @@ def main(): ...@@ -305,6 +305,7 @@ def main():
keep_in_memory=False, keep_in_memory=False,
data_dir=data_args.data_dir, data_dir=data_args.data_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -164,9 +164,9 @@ class ModelArguments: ...@@ -164,9 +164,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -242,6 +242,7 @@ def main(): ...@@ -242,6 +242,7 @@ def main():
data_args.dataset_config_name, data_args.dataset_config_name,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
else: else:
data_files = {} data_files = {}
......
...@@ -150,12 +150,11 @@ def parse_args(): ...@@ -150,12 +150,11 @@ def parse_args():
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument( parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
type=bool, action="store_true",
default=False,
help=( help=(
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
), ),
) )
parser.add_argument( parser.add_argument(
...@@ -284,7 +283,7 @@ def main(): ...@@ -284,7 +283,7 @@ def main():
# download the dataset. # download the dataset.
if args.dataset_name is not None: if args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
dataset = load_dataset(args.dataset_name) dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
else: else:
data_files = {} data_files = {}
if args.train_dir is not None: if args.train_dir is not None:
......
...@@ -63,6 +63,16 @@ class DataTrainingArguments: ...@@ -63,6 +63,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field( dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
) )
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
image_column_name: Optional[str] = field( image_column_name: Optional[str] = field(
default=None, metadata={"help": "The column name of the images in the files."} default=None, metadata={"help": "The column name of the images in the files."}
) )
...@@ -225,6 +235,7 @@ def main(): ...@@ -225,6 +235,7 @@ def main():
data_files=data_args.data_files, data_files=data_args.data_files,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=data_args.trust_remote_code,
) )
# If we don't have a validation split, split off a percentage of train as validation. # If we don't have a validation split, split off a percentage of train as validation.
......
...@@ -166,9 +166,9 @@ class ModelArguments: ...@@ -166,9 +166,9 @@ class ModelArguments:
default=False, default=False,
metadata={ metadata={
"help": ( "help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
) )
}, },
) )
...@@ -299,6 +299,7 @@ def main(): ...@@ -299,6 +299,7 @@ def main():
data_files=data_args.data_files, data_files=data_args.data_files,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
token=model_args.token, token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
) )
# If we don't have a validation split, split off a percentage of train as validation. # If we don't have a validation split, split off a percentage of train as validation.
......
...@@ -197,12 +197,11 @@ def parse_args(): ...@@ -197,12 +197,11 @@ def parse_args():
) )
parser.add_argument( parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
type=bool, action="store_true",
default=False,
help=( help=(
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " "Whether to trust the execution of code from datasets/models defined on the Hub."
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " " This option should only be set to `True` for repositories you trust and in which you have read the"
"execute code present on the Hub on your local machine." " code, as it will execute code present on the Hub on your local machine."
), ),
) )
parser.add_argument( parser.add_argument(
...@@ -441,6 +440,7 @@ def main(): ...@@ -441,6 +440,7 @@ def main():
data_files=args.data_files, data_files=args.data_files,
cache_dir=args.cache_dir, cache_dir=args.cache_dir,
token=args.token, token=args.token,
trust_remote_code=args.trust_remote_code,
) )
# If we don't have a validation split, split off a percentage of train as validation. # If we don't have a validation split, split off a percentage of train as validation.
......
...@@ -68,6 +68,16 @@ class Arguments: ...@@ -68,6 +68,16 @@ class Arguments:
"help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)." "help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)."
}, },
) )
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
image_height: Optional[int] = field(default=512, metadata={"help": "Image height after resizing."}) image_height: Optional[int] = field(default=512, metadata={"help": "Image height after resizing."})
image_width: Optional[int] = field(default=512, metadata={"help": "Image width after resizing."}) image_width: Optional[int] = field(default=512, metadata={"help": "Image width after resizing."})
token: str = field( token: str = field(
...@@ -364,7 +374,7 @@ def main(): ...@@ -364,7 +374,7 @@ def main():
# Load dataset, prepare splits # Load dataset, prepare splits
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------
dataset = load_dataset(args.dataset_name) dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
# We need to specify the label2id mapping for the model # We need to specify the label2id mapping for the model
# it is a mapping from semantic class name to class index. # it is a mapping from semantic class name to class index.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment