Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
37c5759c
Unverified
Commit
37c5759c
authored
Oct 17, 2021
by
Patrick von Platen
Committed by
GitHub
Oct 17, 2021
Browse files
[Speech Examples] Add new audio feature (#14027)
* finish * up * finish all * up
parent
cde0c750
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
75 additions
and
58 deletions
+75
-58
examples/pytorch/_tests_requirements.txt
examples/pytorch/_tests_requirements.txt
+2
-1
examples/pytorch/speech-pretraining/README.md
examples/pytorch/speech-pretraining/README.md
+1
-1
examples/pytorch/speech-pretraining/requirements.txt
examples/pytorch/speech-pretraining/requirements.txt
+1
-0
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
...speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+43
-22
examples/pytorch/speech-recognition/README.md
examples/pytorch/speech-recognition/README.md
+0
-2
examples/pytorch/speech-recognition/requirements.txt
examples/pytorch/speech-recognition/requirements.txt
+2
-1
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
.../pytorch/speech-recognition/run_speech_recognition_ctc.py
+26
-30
examples/pytorch/test_examples.py
examples/pytorch/test_examples.py
+0
-1
No files found.
examples/pytorch/_tests_requirements.txt
View file @
37c5759c
...
...
@@ -13,7 +13,7 @@ streamlit
elasticsearch
nltk
pandas
datasets >= 1.1.3
datasets >= 1.1
3
.3
fire
pytest
conllu
...
...
@@ -21,3 +21,4 @@ sentencepiece != 0.1.92
protobuf
torchvision
jiwer
librosa
examples/pytorch/speech-pretraining/README.md
View file @
37c5759c
...
...
@@ -94,7 +94,7 @@ To pre-train `"large-sized"` Wav2Vec2 model, *e.g.* [facebook/wav2vec2-large-lv6
on
[
librispeech_asr
](
https://huggingface.co/datasets/librispeech_asr
)
, the following command can be run:
```
bash
accelerate launch run_pretrain_no_trainer.py
\
accelerate launch run_
wav2vec2_
pretrain
ing
_no_trainer.py
\
--dataset_name
=
librispeech_asr
\
--dataset_config_names
clean clean other
\
--dataset_split_names
train.100 train.360 train.500
\
...
...
examples/pytorch/speech-pretraining/requirements.txt
View file @
37c5759c
...
...
@@ -2,3 +2,4 @@ datasets >= 1.12.0
torch >= 1.5
torchaudio
accelerate >= 0.5.0
librosa
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
View file @
37c5759c
...
...
@@ -25,7 +25,6 @@ from typing import Dict, List, Optional, Union
import
datasets
import
torch
import
torchaudio
from
datasets
import
DatasetDict
,
concatenate_datasets
,
load_dataset
from
torch.utils.data.dataloader
import
DataLoader
from
tqdm.auto
import
tqdm
...
...
@@ -113,7 +112,7 @@ def parse_args():
parser
.
add_argument
(
"--audio_column_name"
,
type
=
str
,
default
=
"
file
"
,
default
=
"
audio
"
,
help
=
"Column in the dataset that contains speech file path. Defaults to 'file'"
,
)
parser
.
add_argument
(
...
...
@@ -128,6 +127,18 @@ def parse_args():
default
=
None
,
help
=
"Pretrained config name or path if not the same as model_name"
,
)
parser
.
add_argument
(
"--train_cache_file_name"
,
type
=
str
,
default
=
None
,
help
=
"Path to the train cached file name"
,
)
parser
.
add_argument
(
"--validation_cache_file_name"
,
type
=
str
,
default
=
None
,
help
=
"Path to the validation cached file name"
,
)
parser
.
add_argument
(
"--per_device_train_batch_size"
,
type
=
int
,
...
...
@@ -414,9 +425,17 @@ def main():
raw_datasets
[
"validation"
]
=
raw_datasets
[
"train"
].
select
(
range
(
num_validation_samples
))
raw_datasets
[
"train"
]
=
raw_datasets
[
"train"
].
select
(
range
(
num_validation_samples
,
raw_datasets
[
"train"
].
num_rows
))
# 2. Preprocess audio: load, resample, normalize and truncate
# 2. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so that we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor`
feature_extractor
=
Wav2Vec2FeatureExtractor
.
from_pretrained
(
args
.
model_name_or_path
)
# make sure that dataset decodes audio with correct sampling rate
raw_datasets
=
raw_datasets
.
cast_column
(
"audio"
,
datasets
.
features
.
Audio
(
sampling_rate
=
feature_extractor
.
sampling_rate
)
)
# only normalized-inputs-training is supported
if
not
feature_extractor
.
do_normalize
:
raise
ValueError
(
...
...
@@ -427,38 +446,40 @@ def main():
max_length
=
int
(
args
.
max_duration_in_seconds
*
feature_extractor
.
sampling_rate
)
min_length
=
int
(
args
.
min_duration_in_seconds
*
feature_extractor
.
sampling_rate
)
resampler
=
None
if
raw_datasets
[
"train"
][
args
.
audio_column_name
][
0
].
split
(
"."
)[
-
1
]
==
"mp3"
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
resampler
=
torchaudio
.
transforms
.
Resample
(
48_000
,
feature_extractor
.
sampling_rate
)
def
prepare_dataset
(
batch
):
speech_array
,
sampling_rate
=
torchaudio
.
load
(
batch
[
args
.
audio_column_name
])
speech_array
=
speech_array
.
squeeze
()
# if necessary resample audio
if
resampler
is
not
None
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
speech_array
=
resampler
(
speech_array
)
sampling_rate
=
resampler
.
new_freq
sample
=
batch
[
args
.
audio_column_name
]
speech_array
=
speech_array
.
numpy
()
inputs
=
feature_extractor
(
speech_array
,
sampling_rate
=
sampling_rate
,
max_length
=
max_length
,
truncation
=
True
)
inputs
=
feature_extractor
(
sample
[
"array"
],
sampling_rate
=
sample
[
"sampling_rate"
],
max_length
=
max_length
,
truncation
=
True
)
batch
[
"input_values"
]
=
inputs
.
input_values
[
0
]
batch
[
"input_length"
]
=
len
(
inputs
.
input_values
[
0
])
return
batch
# load via mapped files via path
cache_file_names
=
None
if
args
.
train_cache_file_name
is
not
None
:
cache_file_names
=
{
"train"
:
args
.
train_cache_file_name
,
"validation"
:
args
.
validation_cache_file_name
}
# load audio files into numpy arrays
with
accelerator
.
main_process_first
():
vectorized_datasets
=
raw_datasets
.
map
(
prepare_dataset
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
cache_file_names
=
cache_file_names
,
)
if
min_length
>
0.0
:
vectorized_datasets
=
vectorized_datasets
.
filter
(
lambda
x
:
len
(
x
[
"input_values"
])
>
min_length
,
load_from_cache_file
=
not
args
.
overwrite_cache
lambda
x
:
x
>
min_length
,
num_proc
=
args
.
preprocessing_num_workers
,
input_columns
=
[
"input_length"
],
)
vectorized_datasets
=
vectorized_datasets
.
remove_columns
(
"input_length"
)
# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will most likely
# be a timeout when running the script in distributed mode.
...
...
examples/pytorch/speech-recognition/README.md
View file @
37c5759c
...
...
@@ -58,7 +58,6 @@ python run_speech_recognition_ctc.py \
--learning_rate
=
"3e-4"
\
--warmup_steps
=
"500"
\
--evaluation_strategy
=
"steps"
\
--audio_column_name
=
"path"
\
--text_column_name
=
"sentence"
\
--save_steps
=
"400"
\
--eval_steps
=
"100"
\
...
...
@@ -87,7 +86,6 @@ python -m torch.distributed.launch \
--model_name_or_path
=
"facebook/wav2vec2-large-xlsr-53"
\
--dataset_config_name
=
"tr"
\
--output_dir
=
"./wav2vec2-common_voice-tr-demo-dist"
\
--preprocessing_num_workers
=
"16"
\
--overwrite_output_dir
\
--num_train_epochs
=
"15"
\
--per_device_train_batch_size
=
"4"
\
...
...
examples/pytorch/speech-recognition/requirements.txt
View file @
37c5759c
datasets >= 1.1
2.0
datasets >= 1.1
3.3
torch >= 1.5
torchaudio
librosa
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
View file @
37c5759c
...
...
@@ -24,9 +24,9 @@ import sys
from
dataclasses
import
dataclass
,
field
from
typing
import
Dict
,
List
,
Optional
,
Union
import
datasets
import
numpy
as
np
import
torch
import
torchaudio
from
datasets
import
DatasetDict
,
load_dataset
,
load_metric
import
transformers
...
...
@@ -49,8 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.12.0.dev0"
)
# TODO(Patrick) Bump up as soon as audio features are merged
require_version
(
"datasets>=1.12.0"
,
"To fix: pip install -r examples/pytorch/text-classification/requirements.txt"
)
require_version
(
"datasets>=1.13.3"
,
"To fix: pip install -r examples/pytorch/text-classification/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -179,12 +178,12 @@ class DataTrainingArguments:
min_duration_in_seconds
:
Optional
[
float
]
=
field
(
default
=
0.0
,
metadata
=
{
"help"
:
"Filter audio files that are shorter than `min_duration_in_seconds` seconds"
}
)
only_data_
preprocessing
:
Optional
[
bool
]
=
field
(
preprocessing
_only
:
Optional
[
bool
]
=
field
(
default
=
False
,
metadata
=
{
"help"
:
"Whether to only do data preprocessing and skip training. "
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
"In this case, one should run the preprocessing in a non-distributed setup with `
only_data_
preprocessing=True` "
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing
_only
=True` "
"so that the cached datasets can consequently be loaded in distributed training"
},
)
...
...
@@ -450,41 +449,30 @@ def main():
if
model_args
.
freeze_feature_extractor
:
model
.
freeze_feature_extractor
()
# 5. Now we preprocess the datasets which includes loading the audio, resampling and padding
# 5. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so that we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor`
# The following code should be cleaned up as soon as
# https://github.com/huggingface/datasets/pull/2324 is merged
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
# make sure that dataset decodes audio with correct sampling rate
raw_datasets
=
raw_datasets
.
cast_column
(
"audio"
,
datasets
.
features
.
Audio
(
sampling_rate
=
feature_extractor
.
sampling_rate
)
)
# derive max & min input length for sample rate & max duration
max_input_length
=
data_args
.
max_duration_in_seconds
*
processor
.
feature_extractor
.
sampling_rate
min_input_length
=
data_args
.
min_duration_in_seconds
*
processor
.
feature_extractor
.
sampling_rate
resampler
=
None
if
raw_datasets
[
"train"
][
data_args
.
audio_column_name
][
0
].
split
(
"."
)[
-
1
]
==
"mp3"
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
resampler
=
torchaudio
.
transforms
.
Resample
(
48_000
,
processor
.
feature_extractor
.
sampling_rate
)
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
def
prepare_dataset
(
batch
):
# load audio
speech_array
,
sampling_rate
=
torchaudio
.
load
(
batch
[
data_args
.
audio_column_name
])
speech_array
=
speech_array
.
squeeze
()
# if necessary resample audio
if
resampler
is
not
None
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
speech_array
=
resampler
(
speech_array
)
sampling_rate
=
resampler
.
new_freq
speech_array
=
speech_array
.
numpy
()
sample
=
batch
[
data_args
.
audio_column_name
]
batch
[
"input_values"
]
=
processor
(
s
peech_
array
,
sampling_rate
=
sampling_rate
,
truncate
=
True
,
max_length
=
max_input_length
s
ample
[
"
array
"
]
,
sampling_rate
=
sample
[
"
sampling_rate
"
]
,
truncate
=
True
,
max_length
=
max_input_length
).
input_values
[
0
]
batch
[
"input_length"
]
=
len
(
batch
[
"input_values"
])
# Setup the processor for targets
with
processor
.
as_target_processor
():
...
...
@@ -502,10 +490,13 @@ def main():
if
min_input_length
>
0.0
:
# filter data that is shorter than min_input_length
vectorized_datasets
=
vectorized_datasets
.
filter
(
lambda
data
:
len
(
data
[
"input_values"
])
>
min_input_length
,
lambda
x
:
x
>
min_input_length
,
num_proc
=
data_args
.
preprocessing_num_workers
,
input_columns
=
[
"input_length"
],
)
vectorized_datasets
=
vectorized_datasets
.
remove_columns
(
"input_length"
)
# 6. Next, we can prepare the training.
# Let's use word error rate (WER) as our evaluation metric,
# instantiate a data collator and the trainer
...
...
@@ -513,8 +504,13 @@ def main():
# Define Metric during training
wer_metric
=
load_metric
(
"wer"
)
if
data_args
.
only_data_preprocessing
:
logger
.
info
(
"Data preprocessing finished."
)
# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will most likely
# be a timeout when running the script in distributed mode.
# In a second step ``args.preprocessing_only`` can then be set to `False` to load the
# cached dataset
if
data_args
.
preprocessing_only
:
logger
.
info
(
f
"Data preprocessing finished. Files cached at
{
vectorized_datasets
.
cache_files
}
"
)
return
def
compute_metrics
(
pred
):
...
...
examples/pytorch/test_examples.py
View file @
37c5759c
...
...
@@ -395,7 +395,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--audio_column_name file
--do_train
--do_eval
--learning_rate 1e-4
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment