Unverified Commit a14b055b authored by Albert Villanova del Moral's avatar Albert Villanova del Moral Committed by GitHub
Browse files

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
......@@ -112,9 +112,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
......@@ -366,6 +366,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
......
......@@ -316,6 +316,7 @@ class ExamplesTests(TestCasePlus):
testargs = f"""
run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--model_name_or_path microsoft/resnet-18
--do_train
--do_eval
......
......@@ -88,9 +88,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
......@@ -239,6 +239,7 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
......
......@@ -106,9 +106,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
......@@ -333,6 +333,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
......
......@@ -13,7 +13,7 @@ TOKENIZER_CLASSES = {
name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS
}
dataset = datasets.load_dataset("xnli", split="test+validation")
dataset = datasets.load_dataset("facebook/xnli", split="test+validation") # no-script
total = 0
perfect = 0
......
......@@ -102,7 +102,7 @@ _deps = [
"codecarbon==1.2.0",
"cookiecutter==1.7.3",
"dataclasses",
"datasets!=2.5.0,<2.20.0", # Temporary upper version
"datasets!=2.5.0",
"decord==0.6.0",
"deepspeed>=0.9.3",
"diffusers",
......
......@@ -51,7 +51,9 @@ class TextToSpeechTool(PipelineTool):
if not is_datasets_available():
raise ImportError("Datasets needs to be installed if not passing speaker embeddings.")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
embeddings_dataset = load_dataset(
"Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True
)
speaker_embeddings = torch.tensor(embeddings_dataset[7305]["xvector"]).unsqueeze(0)
return {"input_ids": inputs["input_ids"], "speaker_embeddings": speaker_embeddings}
......
......@@ -202,7 +202,9 @@ class PTtoTFCommand(BaseTransformersCLICommand):
"""
def _get_audio_input():
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
raw_samples = [x["array"] for x in speech_samples]
return raw_samples
......@@ -234,7 +236,7 @@ class PTtoTFCommand(BaseTransformersCLICommand):
}
)
if "pixel_values" in model_forward_signature:
sample_images = load_dataset("cifar10", "plain_text", split="test")[:2]["img"]
sample_images = load_dataset("uoft-cs/cifar10", "plain_text", split="test")[:2]["img"] # no-script
processor_inputs.update({"images": sample_images})
if "input_features" in model_forward_signature:
feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters
......
......@@ -9,7 +9,7 @@ deps = {
"codecarbon": "codecarbon==1.2.0",
"cookiecutter": "cookiecutter==1.7.3",
"dataclasses": "dataclasses",
"datasets": "datasets!=2.5.0,<2.20.0",
"datasets": "datasets!=2.5.0",
"decord": "decord==0.6.0",
"deepspeed": "deepspeed>=0.9.3",
"diffusers": "diffusers",
......
......@@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
>>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
>>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means
......@@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor):
>>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
>>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default.
......@@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
>>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt")
>>> input_features = inputs.input_features
......
......@@ -205,7 +205,8 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo
feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length)
if "speech-commands" in model_name:
dataset = load_dataset("speech_commands", "v0.02", split="validation")
# TODO: Convert dataset to Parquet
dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
waveform = dataset[0]["audio"]["array"]
else:
filepath = hf_hub_download(
......
......@@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
# Check outputs on an image
if is_semantic:
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(ds[0]["file"])
else:
image_processor = BeitImageProcessor(
......
......@@ -2409,7 +2409,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
>>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base")
>>> squad_ds = load_dataset("squad_v2", split="train") # doctest: +IGNORE_RESULT
>>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT
>>> # select random long article
>>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"]
......@@ -2711,7 +2711,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
>>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
>>> squad_ds = load_dataset("squad_v2", split="train") # doctest: +IGNORE_RESULT
>>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT
>>> LONG_ARTICLE = squad_ds[81514]["context"]
>>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt")
......@@ -3040,7 +3040,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
>>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base")
>>> squad_ds = load_dataset("squad_v2", split="train") # doctest: +IGNORE_RESULT
>>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT
>>> # select random article and question
>>> LONG_ARTICLE = squad_ds[81514]["context"]
......
......@@ -1681,7 +1681,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
>>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
>>> text = "This is an example text."
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
>>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
......@@ -1754,7 +1754,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
>>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
>>> text = "This is an example text."
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
>>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
......
......@@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
input_audio = [x["array"] for x in ds[:4]["audio"]]
inputs = processor(input_audio, return_tensors="pt", padding=True)
......
......@@ -831,7 +831,7 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
>>> model.config.decoder_start_token_id = tokenizer.bos_token_id
>>> # pre-process inputs and labels
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> inputs = feature_extractor(
... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
... )
......
......@@ -148,7 +148,7 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
model.load_state_dict(new_state_dict)
# verify results on scanned document
dataset = load_dataset("hf-internal-testing/example-documents")
dataset = load_dataset("hf-internal-testing/example-documents") # no-script
image = dataset["test"][0]["image"].convert("RGB")
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
......
......@@ -1431,7 +1431,7 @@ class HubertModel(HubertPreTrainedModel):
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
......
......@@ -1471,7 +1471,7 @@ class TFHubertModel(TFHubertPreTrainedModel):
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
......@@ -1583,7 +1583,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
......
......@@ -1294,7 +1294,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment