"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "1045a36c1f670fb6d18d71260bc9905f5b551843"
Unverified commit f83c6f1d, authored by Sanchit Gandhi, committed by GitHub
Browse files

Remove `trust_remote_code` when loading Libri Dummy (#31748)

* [whisper integration] use parquet dataset for testing

* propagate to others

* more propagation

* last one
parent 3aefb4ec
...@@ -665,9 +665,7 @@ class ClapModelIntegrationTest(unittest.TestCase): ...@@ -665,9 +665,7 @@ class ClapModelIntegrationTest(unittest.TestCase):
"repeat": 0.0023, "repeat": 0.0023,
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
audio_sample = librispeech_dummy[-1] audio_sample = librispeech_dummy[-1]
model_id = "laion/clap-htsat-unfused" model_id = "laion/clap-htsat-unfused"
...@@ -694,9 +692,7 @@ class ClapModelIntegrationTest(unittest.TestCase): ...@@ -694,9 +692,7 @@ class ClapModelIntegrationTest(unittest.TestCase):
"pad": -0.000379, "pad": -0.000379,
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
audio_sample = librispeech_dummy[-1] audio_sample = librispeech_dummy[-1]
model_id = "laion/clap-htsat-fused" model_id = "laion/clap-htsat-fused"
...@@ -723,9 +719,7 @@ class ClapModelIntegrationTest(unittest.TestCase): ...@@ -723,9 +719,7 @@ class ClapModelIntegrationTest(unittest.TestCase):
"pad": 0.0006, "pad": 0.0006,
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
model_id = "laion/clap-htsat-fused" model_id = "laion/clap-htsat-fused"
...@@ -752,9 +746,7 @@ class ClapModelIntegrationTest(unittest.TestCase): ...@@ -752,9 +746,7 @@ class ClapModelIntegrationTest(unittest.TestCase):
"pad": 0.0019, "pad": 0.0019,
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
model_id = "laion/clap-htsat-unfused" model_id = "laion/clap-htsat-unfused"
......
...@@ -209,9 +209,7 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes ...@@ -209,9 +209,7 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
self.assertTrue(pt_processed.input_features.dtype == torch.float32) self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
ds = ds.cast_column("audio", Audio(sampling_rate=22050)) ds = ds.cast_column("audio", Audio(sampling_rate=22050))
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -371,9 +371,7 @@ class ClvpModelForConditionalGenerationTester: ...@@ -371,9 +371,7 @@ class ClvpModelForConditionalGenerationTester:
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
_, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()
ds = datasets.load_dataset( ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
...@@ -555,9 +553,7 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase) ...@@ -555,9 +553,7 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase)
class ClvpIntegrationTest(unittest.TestCase): class ClvpIntegrationTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.text = "This is an example text." self.text = "This is an example text."
ds = datasets.load_dataset( ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
......
...@@ -694,9 +694,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase): ...@@ -694,9 +694,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
@slow @slow
class Data2VecAudioModelIntegrationTest(unittest.TestCase): class Data2VecAudioModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
...@@ -138,9 +138,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. ...@@ -138,9 +138,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -461,9 +461,7 @@ class EncodecIntegrationTest(unittest.TestCase): ...@@ -461,9 +461,7 @@ class EncodecIntegrationTest(unittest.TestCase):
"1.5": [371955], "1.5": [371955],
"24.0": [6659962], "24.0": [6659962],
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
model_id = "facebook/encodec_24khz" model_id = "facebook/encodec_24khz"
model = EncodecModel.from_pretrained(model_id).to(torch_device) model = EncodecModel.from_pretrained(model_id).to(torch_device)
...@@ -517,9 +515,7 @@ class EncodecIntegrationTest(unittest.TestCase): ...@@ -517,9 +515,7 @@ class EncodecIntegrationTest(unittest.TestCase):
"3.0": [144259, 146765, 156435, 176871, 161971], "3.0": [144259, 146765, 156435, 176871, 161971],
"24.0": [1568553, 1294948, 1306190, 1464747, 1663150], "24.0": [1568553, 1294948, 1306190, 1464747, 1663150],
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
model_id = "facebook/encodec_48khz" model_id = "facebook/encodec_48khz"
model = EncodecModel.from_pretrained(model_id).to(torch_device) model = EncodecModel.from_pretrained(model_id).to(torch_device)
...@@ -581,9 +577,7 @@ class EncodecIntegrationTest(unittest.TestCase): ...@@ -581,9 +577,7 @@ class EncodecIntegrationTest(unittest.TestCase):
[85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241],
], ],
} }
librispeech_dummy = load_dataset( librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
model_id = "facebook/encodec_48khz" model_id = "facebook/encodec_48khz"
model = EncodecModel.from_pretrained(model_id).to(torch_device) model = EncodecModel.from_pretrained(model_id).to(torch_device)
......
...@@ -753,9 +753,7 @@ class HubertModelIntegrationTest(unittest.TestCase): ...@@ -753,9 +753,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
...@@ -609,9 +609,7 @@ class TFHubertModelIntegrationTest(unittest.TestCase): ...@@ -609,9 +609,7 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
...@@ -136,9 +136,7 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes ...@@ -136,9 +136,7 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes
self.assertTrue(input_features.extrapolated_beatstep.ndim == 2) self.assertTrue(input_features.extrapolated_beatstep.ndim == 2)
def test_integration(self): def test_integration(self):
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select([0])["audio"] speech_samples = ds.sort("id").select([0])["audio"]
input_speech = [x["array"] for x in speech_samples][0] input_speech = [x["array"] for x in speech_samples][0]
sampling_rate = [x["sampling_rate"] for x in speech_samples][0] sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
......
...@@ -111,9 +111,7 @@ class Pop2PianoProcessorTest(unittest.TestCase): ...@@ -111,9 +111,7 @@ class Pop2PianoProcessorTest(unittest.TestCase):
def get_inputs(self): def get_inputs(self):
"""get inputs for both feature extractor and tokenizer""" """get inputs for both feature extractor and tokenizer"""
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select([0])["audio"] speech_samples = ds.sort("id").select([0])["audio"]
input_speech = [x["array"] for x in speech_samples][0] input_speech = [x["array"] for x in speech_samples][0]
sampling_rate = [x["sampling_rate"] for x in speech_samples][0] sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
......
...@@ -258,9 +258,7 @@ class SeamlessM4TFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt ...@@ -258,9 +258,7 @@ class SeamlessM4TFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
self.assertTrue(pt_processed.input_features.dtype == torch.float32) self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasample(self, id): def _load_datasample(self, id):
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_sample = ds.sort("id")[id]["audio"]["array"] speech_sample = ds.sort("id")[id]["audio"]["array"]
......
...@@ -494,9 +494,7 @@ class SEWModelIntegrationTest(unittest.TestCase): ...@@ -494,9 +494,7 @@ class SEWModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
...@@ -508,9 +508,7 @@ class SEWDModelIntegrationTest(unittest.TestCase): ...@@ -508,9 +508,7 @@ class SEWDModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
...@@ -259,9 +259,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt ...@@ -259,9 +259,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -793,9 +793,7 @@ class Speech2TextModelIntegrationTests(unittest.TestCase): ...@@ -793,9 +793,7 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -587,9 +587,7 @@ class TFSpeech2TextModelIntegrationTests(unittest.TestCase): ...@@ -587,9 +587,7 @@ class TFSpeech2TextModelIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -380,9 +380,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest ...@@ -380,9 +380,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -744,9 +744,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase): ...@@ -744,9 +744,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
...@@ -1771,9 +1769,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase): ...@@ -1771,9 +1769,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
from datasets import load_dataset from datasets import load_dataset
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
......
...@@ -549,9 +549,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T ...@@ -549,9 +549,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
@slow @slow
class UniSpeechModelIntegrationTest(unittest.TestCase): class UniSpeechModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
...@@ -806,9 +806,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -806,9 +806,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):
@slow @slow
class UniSpeechSatModelIntegrationTest(unittest.TestCase): class UniSpeechSatModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):
ds = load_dataset( ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech # automatic decoding with librispeech
speech_samples = ds.sort("id").filter( speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment