Unverified commit a14b055b, authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
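
The change below is mechanical but repeated across docstrings, example scripts, and tests: every `load_dataset` call that targets a script-backed dataset gains an explicit `trust_remote_code=True`, because datasets 2.20.0 (the release the reverted pins were guarding against) stops executing Hub-hosted loading scripts unless the caller opts in. A minimal sketch of the pattern, using one of the fixture repositories that appears throughout the diff:

```python
from datasets import load_dataset

# This repository ships its own loading script, so executing Hub code
# requires an explicit opt-in; without the flag, recent datasets
# releases refuse to run the script instead of running it silently.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,
)
print(ds[0]["audio"]["sampling_rate"])
```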
@@ -1590,7 +1590,7 @@ class WhisperModel(WhisperPreTrainedModel):
 >>> model = WhisperModel.from_pretrained("openai/whisper-base")
 >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
 >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
 >>> input_features = inputs.input_features
 >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
@@ -1731,7 +1731,7 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
 >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
 >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
 >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
 >>> input_features = inputs.input_features
@@ -1983,7 +1983,7 @@ class WhisperForCausalLM(WhisperPreTrainedModel):
 >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2")
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
 >>> sample = ds[0]["audio"]
 >>> input_features = processor(
 ...     sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
...
@@ -385,7 +385,7 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
 >>> import torch
 >>> from datasets import load_dataset
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -411,7 +411,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
 >>> from datasets import load_dataset
 >>> import torch
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -446,7 +446,7 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
 >>> from datasets import load_dataset
 >>> import torch
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -482,7 +482,7 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
 >>> from datasets import load_dataset
 >>> import torch
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -511,7 +511,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
 >>> from datasets import load_dataset
 >>> import torch
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -546,7 +546,7 @@ PT_VISION_BASE_MODEL_SAMPLE = r"""
 >>> import torch
 >>> from datasets import load_dataset
->>> dataset = load_dataset("huggingface/cats-image")
+>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
 >>> image = dataset["test"]["image"][0]
 >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -571,7 +571,7 @@ PT_VISION_SEQ_CLASS_SAMPLE = r"""
 >>> import torch
 >>> from datasets import load_dataset
->>> dataset = load_dataset("huggingface/cats-image")
+>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
 >>> image = dataset["test"]["image"][0]
 >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -803,7 +803,7 @@ TF_SPEECH_BASE_MODEL_SAMPLE = r"""
 >>> from transformers import AutoProcessor, {model_class}
 >>> from datasets import load_dataset
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -828,7 +828,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
 >>> from datasets import load_dataset
 >>> import tensorflow as tf
->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
 >>> dataset = dataset.sort("id")
 >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -863,7 +863,7 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
 >>> from transformers import AutoImageProcessor, {model_class}
 >>> from datasets import load_dataset
->>> dataset = load_dataset("huggingface/cats-image")
+>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
 >>> image = dataset["test"]["image"][0]
 >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -886,7 +886,7 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
 >>> import tensorflow as tf
 >>> from datasets import load_dataset
->>> dataset = load_dataset("huggingface/cats-image")
+>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
 >>> image = dataset["test"]["image"][0]
 >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
...
@@ -128,9 +128,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -274,7 +274,11 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        raw_datasets = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            trust_remote_code=model_args.trust_remote_code,
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -568,6 +572,15 @@ def parse_args():
         default=None,
         help="The configuration name of the dataset to use (via the datasets library).",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
     parser.add_argument(
         "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
     )
@@ -725,7 +738,9 @@ def main():
     # download the dataset.
     if args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        raw_datasets = load_dataset(
+            args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
+        )
     else:
         data_files = {}
         if args.train_file is not None:
...
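
In the example scripts the flag has to travel from the command line into `load_dataset`, through the `ModelArguments` dataclass in the Trainer variants and through `argparse` in the `no_trainer` variants, as the hunks above show. A condensed, self-contained sketch of the argparse wiring (an illustration of the shape, not the actual example script):

```python
import argparse

from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str, required=True)
parser.add_argument("--dataset_config_name", type=str, default=None)
parser.add_argument(
    "--trust_remote_code",
    action="store_true",
    help=(
        "Whether to trust the execution of code from datasets/models defined on the Hub."
        " Only set this for repositories you trust and in which you have read the code."
    ),
)
args = parser.parse_args()

# store_true defaults to False, so users must opt in explicitly on the CLI.
raw_datasets = load_dataset(
    args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
)
```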
@@ -269,6 +269,7 @@ def make_task_cmds():
         "img_clas": f"""
             {scripts_dir}/image-classification/run_image_classification.py
             --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
             --remove_unused_columns False
             --max_steps 10
             --image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json
...
@@ -153,7 +153,9 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
...
@@ -96,7 +96,7 @@ class BeitImageProcessingTester(unittest.TestCase):
 def prepare_semantic_single_inputs():
-    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
     image = Image.open(dataset[0]["file"])
     map = Image.open(dataset[1]["file"])
@@ -105,7 +105,7 @@ def prepare_semantic_single_inputs():
 def prepare_semantic_batch_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
     image1 = Image.open(ds[0]["file"])
     map1 = Image.open(ds[1]["file"])
...
@@ -484,7 +484,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
         image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         image = Image.open(ds[0]["file"])
         inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
@@ -527,7 +527,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
         image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         image = Image.open(ds[0]["file"])
         inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
...
@@ -123,7 +123,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         - https://huggingface.co/bigscience/tokenizer/
         """
         tokenizer = self.get_rust_tokenizer()
-        ds = load_dataset("xnli", "all_languages", split="test", streaming=True)
+        ds = load_dataset("facebook/xnli", "all_languages", split="test", streaming=True)
         sample_data = next(iter(ds))["premise"]  # pick up one data
         input_text = list(sample_data.values())
...
@@ -164,7 +164,9 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
     # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
...
@@ -665,7 +665,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "repeat": 0.0023,
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_sample = librispeech_dummy[-1]
         model_id = "laion/clap-htsat-unfused"
@@ -692,7 +694,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "pad": -0.000379,
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_sample = librispeech_dummy[-1]
         model_id = "laion/clap-htsat-fused"
@@ -719,7 +723,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "pad": 0.0006,
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
         model_id = "laion/clap-htsat-fused"
@@ -746,7 +752,9 @@ class ClapModelIntegrationTest(unittest.TestCase):
             "pad": 0.0019,
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
         model_id = "laion/clap-htsat-unfused"
...
@@ -209,7 +209,9 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
         self.assertTrue(pt_processed.input_features.dtype == torch.float32)

     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", Audio(sampling_rate=22050))
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
...
@@ -371,7 +371,9 @@ class ClvpModelForConditionalGenerationTester:
     def prepare_config_and_inputs(self):
         _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()
-        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = datasets.load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
         _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
@@ -553,7 +555,9 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase)
 class ClvpIntegrationTest(unittest.TestCase):
     def setUp(self):
         self.text = "This is an example text."
-        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = datasets.load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
         _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
...
@@ -493,7 +493,7 @@ class LlamaIntegrationTest(unittest.TestCase):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = pyth_tokenizer.encode(string)
@@ -506,7 +506,7 @@ class LlamaIntegrationTest(unittest.TestCase):
             self.assertEqual(decoded1, decoded2)
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
         for item in tqdm.tqdm(dataset["train"]):
             for string in item["premise"].values():
...
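
The tokenizer integration tests above take a different route: rather than adding `trust_remote_code=True`, they move from the deprecated bare dataset names (`code_x_glue_ct_code_to_text`, `xnli`) to the org-namespaced repositories, and the updated calls add no extra argument, suggesting those repositories serve their data without a loading script. A sketch contrasting the two routes:

```python
from datasets import load_dataset

# Route 1: org-namespaced repo whose calls here stay flag-free, so no
# remote-code opt-in appears to be needed.
xnli = load_dataset("facebook/xnli", "all_languages", split="test", streaming=True)

# Route 2: fixture repo with its own loading script; explicit opt-in required.
dummy = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
```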
@@ -697,7 +697,9 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
 @slow
 class Data2VecAudioModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -706,7 +708,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
         return [x["array"] for x in speech_samples]

     def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
         return ds[:num_samples]
...
@@ -138,7 +138,9 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
...
@@ -462,7 +462,9 @@ class EncodecIntegrationTest(unittest.TestCase):
             "1.5": [371955],
             "24.0": [6659962],
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         model_id = "facebook/encodec_24khz"
         model = EncodecModel.from_pretrained(model_id).to(torch_device)
@@ -516,7 +518,9 @@ class EncodecIntegrationTest(unittest.TestCase):
             "3.0": [144259, 146765, 156435, 176871, 161971],
             "24.0": [1568553, 1294948, 1306190, 1464747, 1663150],
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         model_id = "facebook/encodec_48khz"
         model = EncodecModel.from_pretrained(model_id).to(torch_device)
@@ -578,7 +582,9 @@ class EncodecIntegrationTest(unittest.TestCase):
                 [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241],
             ],
         }
-        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        librispeech_dummy = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         model_id = "facebook/encodec_48khz"
         model = EncodecModel.from_pretrained(model_id).to(torch_device)
...
@@ -314,7 +314,7 @@ class GemmaIntegrationTest(unittest.TestCase):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
-        dataset = load_dataset("code_x_glue_ct_code_to_text", "go")
+        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
         for item in tqdm.tqdm(dataset["validation"]):
             string = item["code"]
             encoded1 = pyth_tokenizer.encode(string)
@@ -333,7 +333,7 @@ class GemmaIntegrationTest(unittest.TestCase):
             self.assertEqual(decoded1, decoded2)
-        dataset = load_dataset("xnli", "all_languages")
+        dataset = load_dataset("facebook/xnli", "all_languages")
         for item in tqdm.tqdm(dataset["train"]):
             for string in item["premise"].values():
...
@@ -757,7 +757,9 @@ class HubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
@@ -768,7 +770,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
     def _load_superb(self, task, num_samples):
         from datasets import load_dataset

-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
         return ds[:num_samples]
...
@@ -609,7 +609,9 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+        )
         # automatic decoding with librispeech
         speech_samples = ds.sort("id").filter(
             lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
...
@@ -103,7 +103,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
         from datasets import load_dataset

-        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
         image = Image.open(ds[0]["file"]).convert("RGB")
...