Unverified commit bdf31d6e, authored by Patrick von Platen, committed by GitHub

[Speech] Move all examples to new audio feature (#14045)

* up

* up

* up

* finish
parent 4334095c
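
For context on the change: the test helpers and examples touched below stop reading audio files by hand with soundfile and instead rely on the dataset's decoded "audio" column. A minimal sketch of the before/after pattern, simplified from the diffs that follow (the `load_waveforms` helper is only illustrative, not part of the commit):

```python
import soundfile as sf

# old pattern: map each example's file path to a raw waveform manually
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

# new pattern: the "audio" column is decoded on access, so no manual file reads;
# each decoded sample is a dict and only its "array" (the waveform) is kept
def load_waveforms(ds, num_samples):
    return [x["array"] for x in ds[:num_samples]["audio"]]
```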
@@ -342,7 +342,7 @@ def main():
     if data_args.audio_column_name not in raw_datasets["train"].column_names:
         raise ValueError(
-            f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. "
+            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
             "Make sure to set `--audio_column_name` to the correct audio column - one of "
             f"{', '.join(raw_datasets['train'].column_names)}."
         )
......
@@ -136,7 +136,7 @@ _deps = [
     "scikit-learn",
     "sentencepiece>=0.1.91,!=0.1.92",
     "sigopt",
-    "soundfile",
+    "librosa",
     "sphinx-copybutton",
     "sphinx-markdown-tables",
     "sphinx-rtd-theme==0.4.3",  # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
@@ -251,10 +251,10 @@ extras["optuna"] = deps_list("optuna")
 extras["ray"] = deps_list("ray[tune]")
 extras["sigopt"] = deps_list("sigopt")
-extras["integrations"] = extras["optuna"] + extras["ray"]+ extras["sigopt"]
+extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("soundfile")
+extras["audio"] = deps_list("librosa")
 extras["speech"] = deps_list("torchaudio") + extras["audio"]  # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
 extras["tf-speech"] = extras["audio"]
......
@@ -54,7 +54,7 @@ deps = {
     "scikit-learn": "scikit-learn",
     "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
     "sigopt": "sigopt",
-    "soundfile": "soundfile",
+    "librosa": "librosa",
     "sphinx-copybutton": "sphinx-copybutton",
     "sphinx-markdown-tables": "sphinx-markdown-tables",
     "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3",
......
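
Both dependency changes above swap soundfile for librosa: the transformers "audio" extra now pulls in librosa, which the datasets audio feature can use as a decoding/resampling backend. As a hedged aside, this is what enables on-the-fly resampling of the audio column; a small sketch (the 16 kHz target is an assumption for wav2vec2-style models, not something set in this commit):

```python
from datasets import Audio, load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# re-cast the column so clips are decoded and resampled to 16 kHz on access
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
print(ds[0]["audio"]["sampling_rate"])  # 16000
```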
@@ -356,21 +356,13 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def test_inference_ctc_robust_batched(self):
         model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", from_pt=True)
......
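
The same `_load_datasamples` rewrite repeats in the remaining test files below, so one note here covers them all: each entry of `speech_samples` is a decoded audio dict, which is why the helpers keep only `"array"`. A sketch of the fields on one sample (field names are those of the datasets audio feature; only `"audio"` and `"array"` appear in the diffs themselves):

```python
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

sample = ds[0]["audio"]     # decoded when the column is accessed
sample["array"]             # 1-D float waveform, what the tests feed to the models
sample["sampling_rate"]     # 16000 for LibriSpeech
sample["path"]              # original file path, unused by the tests
```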
@@ -613,21 +613,11 @@ class HubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def _load_superb(self, task, num_samples):
         from datasets import load_dataset
......
@@ -407,21 +407,13 @@ class SEWModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def test_inference_pretrained_batched(self):
         model = SEWModel.from_pretrained("asapp/sew-tiny-100k").to(torch_device)
......
@@ -428,21 +428,13 @@ class SEWDModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def test_inference_pretrained_batched(self):
         model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k").to(torch_device)
......
@@ -715,18 +715,11 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.sort("id").select(range(num_samples)).map(map_to_array)
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def test_generation_librispeech(self):
         model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
......
@@ -479,21 +479,13 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def test_inference_ctc_normal(self):
         model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
......
@@ -479,21 +479,13 @@ class TFWav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def test_inference_ctc_normal(self):
         model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
......
@@ -900,21 +900,13 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
 
-        return ds["speech"][:num_samples]
+        return [x["array"] for x in speech_samples]
 
     def _load_superb(self, task, num_samples):
         from datasets import load_dataset
......