transformers · Commits · bdf31d6e

[Speech] Move all examples to new audio feature (#14045)

* up * up * up * finish

Authored Oct 18, 2021 by Patrick von Platen; committed via GitHub on Oct 18, 2021. Parent: 4334095c
Showing 11 changed files with 43 additions and 108 deletions.
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py   +1  -1
setup.py                                                             +3  -3
src/transformers/dependency_versions_table.py                        +1  -1
tests/test_modeling_flax_wav2vec2.py                                 +5  -13
tests/test_modeling_hubert.py                                        +3  -13
tests/test_modeling_sew.py                                           +6  -14
tests/test_modeling_sew_d.py                                         +6  -14
tests/test_modeling_speech_to_text.py                                +3  -10
tests/test_modeling_tf_hubert.py                                     +5  -13
tests/test_modeling_tf_wav2vec2.py                                   +5  -13
tests/test_modeling_wav2vec2.py                                      +5  -13
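All eleven diffs below make the same kind of migration: instead of reading waveforms from disk with soundfile inside a map_to_array helper, the integration tests now take them from the dataset's decoded "audio" column. As a minimal sketch of the new access pattern (the dataset id comes from the tests below; the variable names are illustrative, and the audio decoding dependencies from setup.py must be installed):

from datasets import load_dataset

# the dummy LibriSpeech split used by the integration tests below
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Indexing the "audio" column triggers automatic decoding: each element is a dict
# carrying the raw waveform under "array" (alongside metadata such as the sampling rate).
speech_samples = ds.sort("id")[:2]["audio"]
waveforms = [x["array"] for x in speech_samples]
print(len(waveforms), waveforms[0].shape)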
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -342,7 +342,7 @@ def main():
     if data_args.audio_column_name not in raw_datasets["train"].column_names:
         raise ValueError(
-            f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. "
+            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
             "Make sure to set `--audio_column_name` to the correct audio column - one of "
             f"{', '.join(raw_datasets['train'].column_names)}."
         )
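The only change in this hunk is that the column name is now quoted inside the error message. With hypothetical values (not taken from the diff), the updated f-string renders roughly as follows:

# hypothetical values, purely to illustrate the reworded error message
audio_column_name = "audio"
dataset_name = "common_voice"
train_columns = ["client_id", "path", "sentence"]

message = (
    f"--audio_column_name '{audio_column_name}' not found in dataset '{dataset_name}'. "
    "Make sure to set `--audio_column_name` to the correct audio column - one of "
    f"{', '.join(train_columns)}."
)
print(message)
# --audio_column_name 'audio' not found in dataset 'common_voice'. Make sure to set
# `--audio_column_name` to the correct audio column - one of client_id, path, sentence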
setup.py
@@ -136,7 +136,7 @@ _deps = [
     "scikit-learn",
     "sentencepiece>=0.1.91,!=0.1.92",
     "sigopt",
-    "soundfile",
+    "librosa",
     "sphinx-copybutton",
     "sphinx-markdown-tables",
     "sphinx-rtd-theme==0.4.3",  # sphinx-rtd-theme==0.5.0 introduced big changes in the style.

@@ -251,10 +251,10 @@ extras["optuna"] = deps_list("optuna")
 extras["ray"] = deps_list("ray[tune]")
 extras["sigopt"] = deps_list("sigopt")

 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]

 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("soundfile")
+extras["audio"] = deps_list("librosa")
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
 extras["tf-speech"] = extras["audio"]
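Practically, the hunks above swap the audio decoding dependency from soundfile to librosa and layer the speech extras on top of the "audio" extra. A rough, hedged illustration of the resulting composition (not the literal setup.py code, which resolves pinned versions through deps_list and _deps):

# illustration only: how the speech extras are layered after this commit
audio = ["librosa"]                    # previously ["soundfile"]
torch_speech = ["torchaudio"] + audio  # `pip install ".[torch-speech]"`, the preferred extra
speech = ["torchaudio"] + audio        # `pip install ".[speech]"` is deprecated but still works
tf_speech = audio                      # the TF speech extra only needs the decoding stack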
src/transformers/dependency_versions_table.py
@@ -54,7 +54,7 @@ deps = {
     "scikit-learn": "scikit-learn",
     "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
     "sigopt": "sigopt",
-    "soundfile": "soundfile",
+    "librosa": "librosa",
     "sphinx-copybutton": "sphinx-copybutton",
     "sphinx-markdown-tables": "sphinx-markdown-tables",
     "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3",
tests/test_modeling_flax_wav2vec2.py
@@ -356,21 +356,13 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch

         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def test_inference_ctc_robust_batched(self):
         model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", from_pt=True)
tests/test_modeling_hubert.py
@@ -613,21 +613,11 @@ class HubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch

         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def _load_superb(self, task, num_samples):
         from datasets import load_dataset
tests/test_modeling_sew.py
@@ -407,21 +407,13 @@ class SEWModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
-
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def test_inference_pretrained_batched(self):
         model = SEWModel.from_pretrained("asapp/sew-tiny-100k").to(torch_device)
tests/test_modeling_sew_d.py
@@ -428,21 +428,13 @@ class SEWDModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch
-
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def test_inference_pretrained_batched(self):
         model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k").to(torch_device)
tests/test_modeling_speech_to_text.py
@@ -715,18 +715,11 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch

         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.sort("id").select(range(num_samples)).map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def test_generation_librispeech(self):
         model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
tests/test_modeling_tf_hubert.py
@@ -479,21 +479,13 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch

         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def test_inference_ctc_normal(self):
         model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
tests/test_modeling_tf_wav2vec2.py
@@ -479,21 +479,13 @@ class TFWav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch

         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def test_inference_ctc_normal(self):
         model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tests/test_modeling_wav2vec2.py
@@ -900,21 +900,13 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
-        import soundfile as sf
-
-        ids = [f"1272-141231-000{i}" for i in range(num_samples)]
-
-        # map files to raw
-        def map_to_array(batch):
-            speech, _ = sf.read(batch["file"])
-            batch["speech"] = speech
-            return batch

         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array)
-
-        return ds["speech"][:num_samples]
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").filter(
+            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
+        )[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]

     def _load_superb(self, task, num_samples):
         from datasets import load_dataset