Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
bdf31d6e
Unverified
Commit
bdf31d6e
authored
Oct 18, 2021
by
Patrick von Platen
Committed by
GitHub
Oct 18, 2021
Browse files
[Speech] Move all examples to new audio feature (#14045)
* up * up * up * finish
parent
4334095c
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
43 additions
and
108 deletions
+43
-108
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
.../pytorch/speech-recognition/run_speech_recognition_ctc.py
+1
-1
setup.py
setup.py
+3
-3
src/transformers/dependency_versions_table.py
src/transformers/dependency_versions_table.py
+1
-1
tests/test_modeling_flax_wav2vec2.py
tests/test_modeling_flax_wav2vec2.py
+5
-13
tests/test_modeling_hubert.py
tests/test_modeling_hubert.py
+3
-13
tests/test_modeling_sew.py
tests/test_modeling_sew.py
+6
-14
tests/test_modeling_sew_d.py
tests/test_modeling_sew_d.py
+6
-14
tests/test_modeling_speech_to_text.py
tests/test_modeling_speech_to_text.py
+3
-10
tests/test_modeling_tf_hubert.py
tests/test_modeling_tf_hubert.py
+5
-13
tests/test_modeling_tf_wav2vec2.py
tests/test_modeling_tf_wav2vec2.py
+5
-13
tests/test_modeling_wav2vec2.py
tests/test_modeling_wav2vec2.py
+5
-13
No files found.
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
View file @
bdf31d6e
...
...
@@ -342,7 +342,7 @@ def main():
if
data_args
.
audio_column_name
not
in
raw_datasets
[
"train"
].
column_names
:
raise
ValueError
(
f
"--audio_column_name
{
data_args
.
audio_column_name
}
not found in dataset '
{
data_args
.
dataset_name
}
'. "
f
"--audio_column_name
'
{
data_args
.
audio_column_name
}
'
not found in dataset '
{
data_args
.
dataset_name
}
'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f
"
{
', '
.
join
(
raw_datasets
[
'train'
].
column_names
)
}
."
)
...
...
setup.py
View file @
bdf31d6e
...
...
@@ -136,7 +136,7 @@ _deps = [
"scikit-learn"
,
"sentencepiece>=0.1.91,!=0.1.92"
,
"sigopt"
,
"
soundfile
"
,
"
librosa
"
,
"sphinx-copybutton"
,
"sphinx-markdown-tables"
,
"sphinx-rtd-theme==0.4.3"
,
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
...
...
@@ -251,10 +251,10 @@ extras["optuna"] = deps_list("optuna")
extras
[
"ray"
]
=
deps_list
(
"ray[tune]"
)
extras
[
"sigopt"
]
=
deps_list
(
"sigopt"
)
extras
[
"integrations"
]
=
extras
[
"optuna"
]
+
extras
[
"ray"
]
+
extras
[
"sigopt"
]
extras
[
"integrations"
]
=
extras
[
"optuna"
]
+
extras
[
"ray"
]
+
extras
[
"sigopt"
]
extras
[
"serving"
]
=
deps_list
(
"pydantic"
,
"uvicorn"
,
"fastapi"
,
"starlette"
)
extras
[
"audio"
]
=
deps_list
(
"
soundfile
"
)
extras
[
"audio"
]
=
deps_list
(
"
librosa
"
)
extras
[
"speech"
]
=
deps_list
(
"torchaudio"
)
+
extras
[
"audio"
]
# `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
extras
[
"torch-speech"
]
=
deps_list
(
"torchaudio"
)
+
extras
[
"audio"
]
extras
[
"tf-speech"
]
=
extras
[
"audio"
]
...
...
src/transformers/dependency_versions_table.py
View file @
bdf31d6e
...
...
@@ -54,7 +54,7 @@ deps = {
"scikit-learn"
:
"scikit-learn"
,
"sentencepiece"
:
"sentencepiece>=0.1.91,!=0.1.92"
,
"sigopt"
:
"sigopt"
,
"
soundfile"
:
"soundfile
"
,
"
librosa"
:
"librosa
"
,
"sphinx-copybutton"
:
"sphinx-copybutton"
,
"sphinx-markdown-tables"
:
"sphinx-markdown-tables"
,
"sphinx-rtd-theme"
:
"sphinx-rtd-theme==0.4.3"
,
...
...
tests/test_modeling_flax_wav2vec2.py
View file @
bdf31d6e
...
...
@@ -356,21 +356,13 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
filter
(
lambda
x
:
x
[
"id"
]
in
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
)[:
num_samples
][
"audio"
]
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
test_inference_ctc_robust_batched
(
self
):
model
=
FlaxWav2Vec2ForCTC
.
from_pretrained
(
"facebook/wav2vec2-large-960h-lv60-self"
,
from_pt
=
True
)
...
...
tests/test_modeling_hubert.py
View file @
bdf31d6e
...
...
@@ -613,21 +613,11 @@ class HubertModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
select
(
range
(
num_samples
))[:
num_samples
][
"audio"
]
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
_load_superb
(
self
,
task
,
num_samples
):
from
datasets
import
load_dataset
...
...
tests/test_modeling_sew.py
View file @
bdf31d6e
...
...
@@ -407,21 +407,13 @@ class SEWModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
filter
(
lambda
x
:
x
[
"id"
]
in
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
)[:
num_samples
][
"audio"
]
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"patrickvonplaten/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
test_inference_pretrained_batched
(
self
):
model
=
SEWModel
.
from_pretrained
(
"asapp/sew-tiny-100k"
).
to
(
torch_device
)
...
...
tests/test_modeling_sew_d.py
View file @
bdf31d6e
...
...
@@ -428,21 +428,13 @@ class SEWDModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
filter
(
lambda
x
:
x
[
"id"
]
in
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
)[:
num_samples
][
"audio"
]
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"patrickvonplaten/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
test_inference_pretrained_batched
(
self
):
model
=
SEWDModel
.
from_pretrained
(
"asapp/sew-d-tiny-100k"
).
to
(
torch_device
)
...
...
tests/test_modeling_speech_to_text.py
View file @
bdf31d6e
...
...
@@ -715,18 +715,11 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
ds
=
ds
.
sort
(
"id"
).
select
(
range
(
num_samples
)).
map
(
map_to_array
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
select
(
range
(
num_samples
))[:
num_samples
][
"audio"
]
return
ds
[
"speech"
][:
num
_samples
]
return
[
x
[
"array"
]
for
x
in
speech
_samples
]
def
test_generation_librispeech
(
self
):
model
=
Speech2TextForConditionalGeneration
.
from_pretrained
(
"facebook/s2t-small-librispeech-asr"
)
...
...
tests/test_modeling_tf_hubert.py
View file @
bdf31d6e
...
...
@@ -479,21 +479,13 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
filter
(
lambda
x
:
x
[
"id"
]
in
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
)[:
num_samples
][
"audio"
]
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
test_inference_ctc_normal
(
self
):
model
=
TFHubertForCTC
.
from_pretrained
(
"facebook/hubert-large-ls960-ft"
)
...
...
tests/test_modeling_tf_wav2vec2.py
View file @
bdf31d6e
...
...
@@ -479,21 +479,13 @@ class TFWav2Vec2ModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
filter
(
lambda
x
:
x
[
"id"
]
in
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
)[:
num_samples
][
"audio"
]
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
test_inference_ctc_normal
(
self
):
model
=
TFWav2Vec2ForCTC
.
from_pretrained
(
"facebook/wav2vec2-base-960h"
)
...
...
tests/test_modeling_wav2vec2.py
View file @
bdf31d6e
...
...
@@ -900,21 +900,13 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
def
_load_datasamples
(
self
,
num_samples
):
from
datasets
import
load_dataset
import
soundfile
as
sf
ids
=
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
# map files to raw
def
map_to_array
(
batch
):
speech
,
_
=
sf
.
read
(
batch
[
"file"
])
batch
[
"speech"
]
=
speech
return
batch
ds
=
load_dataset
(
"hf-internal-testing/librispeech_asr_dummy"
,
"clean"
,
split
=
"validation"
)
# automatic decoding with librispeech
speech_samples
=
ds
.
sort
(
"id"
).
filter
(
lambda
x
:
x
[
"id"
]
in
[
f
"1272-141231-000
{
i
}
"
for
i
in
range
(
num_samples
)]
)[:
num_samples
][
"audio"
]
ds
=
ds
.
filter
(
lambda
x
:
x
[
"id"
]
in
ids
).
sort
(
"id"
).
map
(
map_to_array
)
return
ds
[
"speech"
][:
num_samples
]
return
[
x
[
"array"
]
for
x
in
speech_samples
]
def
_load_superb
(
self
,
task
,
num_samples
):
from
datasets
import
load_dataset
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment