Unverified Commit a14b055b authored by Albert Villanova del Moral's avatar Albert Villanova del Moral Committed by GitHub
Browse files

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
......@@ -167,7 +167,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd")
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
def preprocess_data(examples):
......@@ -203,7 +203,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
......
......@@ -102,7 +102,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image = Image.open(ds[0]["file"]).convert("RGB")
......
......@@ -183,7 +183,7 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
......
......@@ -160,7 +160,7 @@ class LayoutXLMProcessorTest(unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd")
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
def preprocess_data(examples):
......@@ -198,7 +198,7 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
......
......@@ -514,7 +514,7 @@ class LlamaIntegrationTest(unittest.TestCase):
pyth_tokenizer = self.tokenizer
rust_tokenizer = self.rust_tokenizer
dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
for item in tqdm.tqdm(dataset["validation"]):
string = item["code"]
encoded1 = pyth_tokenizer.encode(string)
......@@ -527,7 +527,7 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(decoded1, decoded2)
dataset = load_dataset("xnli", "all_languages")
dataset = load_dataset("facebook/xnli", "all_languages")
for item in tqdm.tqdm(dataset["train"]):
for string in item["premise"].values():
......
......@@ -87,7 +87,7 @@ class MobileViTImageProcessingTester(unittest.TestCase):
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
......@@ -96,7 +96,7 @@ def prepare_semantic_single_inputs():
def prepare_semantic_batch_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
......
......@@ -851,7 +851,7 @@ def prepare_img():
# Helper functions for optical flow integration test
def prepare_optical_flow_images():
dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[0]["file"]).convert("RGB")
......
......@@ -136,7 +136,9 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes
self.assertTrue(input_features.extrapolated_beatstep.ndim == 2)
def test_integration(self):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select([0])["audio"]
input_speech = [x["array"] for x in speech_samples][0]
sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
......
......@@ -111,7 +111,9 @@ class Pop2PianoProcessorTest(unittest.TestCase):
def get_inputs(self):
"""get inputs for both feature extractor and tokenizer"""
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select([0])["audio"]
input_speech = [x["array"] for x in speech_samples][0]
sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
......
......@@ -258,7 +258,9 @@ class SeamlessM4TFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasample(self, id):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_sample = ds.sort("id")[id]["audio"]["array"]
......
......@@ -87,7 +87,7 @@ class SegformerImageProcessingTester(unittest.TestCase):
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
......@@ -96,7 +96,7 @@ def prepare_semantic_single_inputs():
def prepare_semantic_batch_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
......
......@@ -497,7 +497,9 @@ class SEWModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
......@@ -511,7 +511,9 @@ class SEWDModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
......@@ -259,7 +259,9 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
......@@ -791,7 +791,9 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
......@@ -587,7 +587,9 @@ class TFSpeech2TextModelIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
......@@ -380,7 +380,9 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
......@@ -741,7 +741,9 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......@@ -1763,7 +1765,9 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
......@@ -598,7 +598,7 @@ class CommonSpmIntegrationTests(unittest.TestCase):
from datasets import load_dataset
from seqio import SentencePieceVocabulary
ds = load_dataset("xnli", "all_languages", split="train+test+validation")
ds = load_dataset("facebook/xnli", "all_languages", split="train+test+validation")
# TODO @ArthurZucker fix the 3 commented tests with #23909
input_texts = [
......
......@@ -185,7 +185,7 @@ class UdopProcessorTest(unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd")
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
def preprocess_data(examples):
......@@ -223,7 +223,7 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment