"vscode:/vscode.git/clone" did not exist on "28d5700aae1bed4ec721cbeb5bc2527079113f46"
Unverified Commit a14b055b authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
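
Across the touched examples the change follows one pattern: expose a trust_remote_code flag (via argparse or a dataclass field) and forward it to datasets.load_dataset. Below is a minimal sketch of that pattern, not code from this commit; the dataset name is illustrative. The motivation is visible in the revert bullets above: datasets 2.20 stops executing Hub loading scripts unless trust_remote_code=True is passed explicitly, so the temporary <2.20.0 pins could be dropped once every call site passed the flag.

# Minimal sketch of the pattern this commit rolls out across the examples.
# The dataset name below is illustrative, not one used in the repo.
import argparse

from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument(
    "--trust_remote_code",
    action="store_true",
    help="Allow executing dataset-loading code from the Hub on this machine.",
)
args = parser.parse_args()

# As of datasets 2.20, loading a script-based dataset without
# trust_remote_code=True raises an error instead of prompting.
dataset = load_dataset("some_user/script_dataset", trust_remote_code=args.trust_remote_code)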
@@ -71,6 +71,15 @@ def parse_args():
         help="Name of the dataset on the hub.",
         default="qubvel-hf/ade20k-mini",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--image_height",
         type=int,
@@ -425,7 +434,7 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
     # We need to specify the label2id mapping for the model
     # it is a mapping from semantic class name to class index.
...
@@ -124,9 +124,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -312,6 +312,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -321,6 +322,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -329,6 +331,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -195,12 +195,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -327,17 +326,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
...
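
Besides threading the flag through, the no_trainer hunks above replace type=bool with action="store_true", which fixes a classic argparse pitfall: bool() applied to any non-empty string is True, so --trust_remote_code False would silently enable the option. A self-contained demonstration:

# Why type=bool is a trap in argparse: bool("False") is True, since any
# non-empty string is truthy. action="store_true" avoids this entirely.
import argparse

broken = argparse.ArgumentParser()
broken.add_argument("--trust_remote_code", type=bool, default=False)
print(broken.parse_args(["--trust_remote_code", "False"]).trust_remote_code)  # True!

fixed = argparse.ArgumentParser()
fixed.add_argument("--trust_remote_code", action="store_true")
print(fixed.parse_args([]).trust_remote_code)  # False
print(fixed.parse_args(["--trust_remote_code"]).trust_remote_code)  # True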
@@ -127,9 +127,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -382,6 +382,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -391,6 +392,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -399,6 +401,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -257,12 +257,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
        ),
    )
    parser.add_argument(
@@ -395,17 +394,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -121,9 +121,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -324,6 +324,7 @@ def main():
             cache_dir=model_args.cache_dir,
             token=model_args.token,
             streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -333,6 +334,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -341,6 +343,7 @@ def main():
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
                 streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -202,12 +202,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -334,17 +333,21 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[:{args.validation_split_percentage}%]",
+                trust_remote_code=args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 args.dataset_name,
                 args.dataset_config_name,
                 split=f"train[{args.validation_split_percentage}%:]",
+                trust_remote_code=args.trust_remote_code,
             )
     else:
         data_files = {}
...
@@ -133,6 +133,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
@@ -292,6 +302,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -300,6 +311,7 @@ def main():
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
+                trust_remote_code=data_args.trust_remote_code,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -307,6 +319,7 @@ def main():
                 split=f"train[{data_args.validation_split_percentage}%:]",
                 cache_dir=model_args.cache_dir,
                 token=model_args.token,
+                trust_remote_code=data_args.trust_remote_code,
             )
     else:
         data_files = {}
...
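
For the Trainer-based scripts, as in the hunk above, the flag arrives as a dataclass field rather than a handwritten argparse option; HfArgumentParser maps each dataclass field to a CLI flag automatically, and a bool field defaulting to False becomes a switch-style flag. A minimal sketch, using a stripped-down stand-in for the real DataTrainingArguments:

# Minimal sketch: HfArgumentParser turns dataclass fields into CLI flags,
# so the new field becomes a --trust_remote_code option automatically.
# This stripped-down dataclass stands in for the real DataTrainingArguments.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    trust_remote_code: bool = field(
        default=False,
        metadata={"help": "Whether to trust dataset code from the Hub."},
    )


(data_args,) = HfArgumentParser(DataTrainingArguments).parse_args_into_dataclasses(
    ["--trust_remote_code"]
)
print(data_args.trust_remote_code)  # True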
@@ -184,12 +184,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -351,7 +350,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
@@ -313,9 +313,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -383,7 +383,9 @@ def main():
     # Load dataset, prepare splits
     # ------------------------------------------------------------------------------------------------
-    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
+    dataset = load_dataset(
+        data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
+    )
     # If we don't have a validation split, split off a percentage of train as validation
     data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
...
@@ -340,12 +340,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -445,7 +444,7 @@ def main():
     # Load dataset
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
     # If we don't have a validation split, split off a percentage of train as validation.
     args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
...
@@ -93,9 +93,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -301,6 +301,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -101,6 +101,16 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
     validation_file: Optional[str] = field(
         default=None,
@@ -289,6 +299,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -100,6 +100,15 @@ def parse_args():
         default=None,
         help="The configuration name of the dataset to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
     )
@@ -356,7 +365,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
@@ -275,12 +275,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -404,7 +403,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
@@ -93,9 +93,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -346,6 +346,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -165,9 +165,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -233,7 +233,9 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
     # TODO support datasets from local folders
-    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
+    dataset = load_dataset(
+        data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
+    )
     # Rename column names to standardized names (only "image" and "label" need to be present)
     if "pixel_values" in dataset["train"].column_names:
...
@@ -180,12 +180,11 @@ def parse_args():
     parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
     parser.add_argument(
         "--trust_remote_code",
-        type=bool,
-        default=False,
+        action="store_true",
         help=(
-            "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-            "execute code present on the Hub on your local machine."
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
         ),
     )
     parser.add_argument(
@@ -294,7 +293,7 @@ def main():
     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
     # download the dataset.
     # TODO support datasets from local folders
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
     # Rename column names to standardized names (only "image" and "label" need to be present)
     if "pixel_values" in dataset["train"].column_names:
...
@@ -71,6 +71,15 @@ def parse_args():
         required=True,
         help="The names of the training data set splits to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--preprocessing_num_workers",
         type=int,
@@ -446,6 +455,7 @@ def main():
             dataset_config_name,
             split=train_split_name,
             cache_dir=args.cache_dir,
+            trust_remote_code=args.trust_remote_code,
         )
         datasets_splits.append(dataset_split)
...
@@ -255,9 +255,9 @@ class DataTrainingArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -454,6 +454,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.train_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -479,6 +480,7 @@ def main():
             data_args.dataset_config_name,
             split=data_args.eval_split_name,
             token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
         )
         if data_args.max_eval_samples is not None:
...