Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
37c5759c
Unverified
Commit
37c5759c
authored
Oct 17, 2021
by
Patrick von Platen
Committed by
GitHub
Oct 17, 2021
Browse files
[Speech Examples] Add new audio feature (#14027)
* finish * up * finish all * up
parent
cde0c750
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
75 additions
and
58 deletions
+75
-58
examples/pytorch/_tests_requirements.txt
examples/pytorch/_tests_requirements.txt
+2
-1
examples/pytorch/speech-pretraining/README.md
examples/pytorch/speech-pretraining/README.md
+1
-1
examples/pytorch/speech-pretraining/requirements.txt
examples/pytorch/speech-pretraining/requirements.txt
+1
-0
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
...speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+43
-22
examples/pytorch/speech-recognition/README.md
examples/pytorch/speech-recognition/README.md
+0
-2
examples/pytorch/speech-recognition/requirements.txt
examples/pytorch/speech-recognition/requirements.txt
+2
-1
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
.../pytorch/speech-recognition/run_speech_recognition_ctc.py
+26
-30
examples/pytorch/test_examples.py
examples/pytorch/test_examples.py
+0
-1
No files found.
examples/pytorch/_tests_requirements.txt
View file @
37c5759c
...
...
@@ -13,7 +13,7 @@ streamlit
elasticsearch
nltk
pandas
datasets >= 1.1.3
datasets >= 1.1
3
.3
fire
pytest
conllu
...
...
@@ -21,3 +21,4 @@ sentencepiece != 0.1.92
protobuf
torchvision
jiwer
librosa
examples/pytorch/speech-pretraining/README.md
View file @
37c5759c
...
...
@@ -94,7 +94,7 @@ To pre-train `"large-sized"` Wav2Vec2 model, *e.g.* [facebook/wav2vec2-large-lv6
on
[
librispeech_asr
](
https://huggingface.co/datasets/librispeech_asr
)
, the following command can be run:
```
bash
accelerate launch run_pretrain_no_trainer.py
\
accelerate launch run_
wav2vec2_
pretrain
ing
_no_trainer.py
\
--dataset_name
=
librispeech_asr
\
--dataset_config_names
clean clean other
\
--dataset_split_names
train.100 train.360 train.500
\
...
...
examples/pytorch/speech-pretraining/requirements.txt
View file @
37c5759c
...
...
@@ -2,3 +2,4 @@ datasets >= 1.12.0
torch >= 1.5
torchaudio
accelerate >= 0.5.0
librosa
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
View file @
37c5759c
...
...
@@ -25,7 +25,6 @@ from typing import Dict, List, Optional, Union
import
datasets
import
torch
import
torchaudio
from
datasets
import
DatasetDict
,
concatenate_datasets
,
load_dataset
from
torch.utils.data.dataloader
import
DataLoader
from
tqdm.auto
import
tqdm
...
...
@@ -113,7 +112,7 @@ def parse_args():
parser
.
add_argument
(
"--audio_column_name"
,
type
=
str
,
default
=
"
file
"
,
default
=
"
audio
"
,
help
=
"Column in the dataset that contains speech file path. Defaults to 'file'"
,
)
parser
.
add_argument
(
...
...
@@ -128,6 +127,18 @@ def parse_args():
default
=
None
,
help
=
"Pretrained config name or path if not the same as model_name"
,
)
parser
.
add_argument
(
"--train_cache_file_name"
,
type
=
str
,
default
=
None
,
help
=
"Path to the train cached file name"
,
)
parser
.
add_argument
(
"--validation_cache_file_name"
,
type
=
str
,
default
=
None
,
help
=
"Path to the validation cached file name"
,
)
parser
.
add_argument
(
"--per_device_train_batch_size"
,
type
=
int
,
...
...
@@ -414,9 +425,17 @@ def main():
raw_datasets
[
"validation"
]
=
raw_datasets
[
"train"
].
select
(
range
(
num_validation_samples
))
raw_datasets
[
"train"
]
=
raw_datasets
[
"train"
].
select
(
range
(
num_validation_samples
,
raw_datasets
[
"train"
].
num_rows
))
# 2. Preprocess audio: load, resample, normalize and truncate
# 2. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so that we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor`
feature_extractor
=
Wav2Vec2FeatureExtractor
.
from_pretrained
(
args
.
model_name_or_path
)
# make sure that dataset decodes audio with correct sampling rate
raw_datasets
=
raw_datasets
.
cast_column
(
"audio"
,
datasets
.
features
.
Audio
(
sampling_rate
=
feature_extractor
.
sampling_rate
)
)
# only normalized-inputs-training is supported
if
not
feature_extractor
.
do_normalize
:
raise
ValueError
(
...
...
@@ -427,38 +446,40 @@ def main():
max_length
=
int
(
args
.
max_duration_in_seconds
*
feature_extractor
.
sampling_rate
)
min_length
=
int
(
args
.
min_duration_in_seconds
*
feature_extractor
.
sampling_rate
)
resampler
=
None
if
raw_datasets
[
"train"
][
args
.
audio_column_name
][
0
].
split
(
"."
)[
-
1
]
==
"mp3"
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
resampler
=
torchaudio
.
transforms
.
Resample
(
48_000
,
feature_extractor
.
sampling_rate
)
def
prepare_dataset
(
batch
):
speech_array
,
sampling_rate
=
torchaudio
.
load
(
batch
[
args
.
audio_column_name
])
speech_array
=
speech_array
.
squeeze
()
# if necessary resample audio
if
resampler
is
not
None
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
speech_array
=
resampler
(
speech_array
)
sampling_rate
=
resampler
.
new_freq
sample
=
batch
[
args
.
audio_column_name
]
speech_array
=
speech_array
.
numpy
()
inputs
=
feature_extractor
(
speech_array
,
sampling_rate
=
sampling_rate
,
max_length
=
max_length
,
truncation
=
True
)
inputs
=
feature_extractor
(
sample
[
"array"
],
sampling_rate
=
sample
[
"sampling_rate"
],
max_length
=
max_length
,
truncation
=
True
)
batch
[
"input_values"
]
=
inputs
.
input_values
[
0
]
batch
[
"input_length"
]
=
len
(
inputs
.
input_values
[
0
])
return
batch
# load via mapped files via path
cache_file_names
=
None
if
args
.
train_cache_file_name
is
not
None
:
cache_file_names
=
{
"train"
:
args
.
train_cache_file_name
,
"validation"
:
args
.
validation_cache_file_name
}
# load audio files into numpy arrays
with
accelerator
.
main_process_first
():
vectorized_datasets
=
raw_datasets
.
map
(
prepare_dataset
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
vectorized_datasets
=
vectorized_datasets
.
filter
(
lambda
x
:
len
(
x
[
"input_values"
])
>
min_length
,
load_from_cache_file
=
not
args
.
overwrite_cache
cache_file_names
=
cache_file_names
,
)
if
min_length
>
0.0
:
vectorized_datasets
=
vectorized_datasets
.
filter
(
lambda
x
:
x
>
min_length
,
num_proc
=
args
.
preprocessing_num_workers
,
input_columns
=
[
"input_length"
],
)
vectorized_datasets
=
vectorized_datasets
.
remove_columns
(
"input_length"
)
# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will most likely
# be a timeout when running the script in distributed mode.
...
...
examples/pytorch/speech-recognition/README.md
View file @
37c5759c
...
...
@@ -58,7 +58,6 @@ python run_speech_recognition_ctc.py \
--learning_rate
=
"3e-4"
\
--warmup_steps
=
"500"
\
--evaluation_strategy
=
"steps"
\
--audio_column_name
=
"path"
\
--text_column_name
=
"sentence"
\
--save_steps
=
"400"
\
--eval_steps
=
"100"
\
...
...
@@ -87,7 +86,6 @@ python -m torch.distributed.launch \
--model_name_or_path
=
"facebook/wav2vec2-large-xlsr-53"
\
--dataset_config_name
=
"tr"
\
--output_dir
=
"./wav2vec2-common_voice-tr-demo-dist"
\
--preprocessing_num_workers
=
"16"
\
--overwrite_output_dir
\
--num_train_epochs
=
"15"
\
--per_device_train_batch_size
=
"4"
\
...
...
examples/pytorch/speech-recognition/requirements.txt
View file @
37c5759c
datasets >= 1.1
2.0
datasets >= 1.1
3.3
torch >= 1.5
torchaudio
librosa
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
View file @
37c5759c
...
...
@@ -24,9 +24,9 @@ import sys
from
dataclasses
import
dataclass
,
field
from
typing
import
Dict
,
List
,
Optional
,
Union
import
datasets
import
numpy
as
np
import
torch
import
torchaudio
from
datasets
import
DatasetDict
,
load_dataset
,
load_metric
import
transformers
...
...
@@ -49,8 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version
(
"4.12.0.dev0"
)
# TODO(Patrick) Bump up as soon as audio features are merged
require_version
(
"datasets>=1.12.0"
,
"To fix: pip install -r examples/pytorch/text-classification/requirements.txt"
)
require_version
(
"datasets>=1.13.3"
,
"To fix: pip install -r examples/pytorch/text-classification/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -179,12 +178,12 @@ class DataTrainingArguments:
min_duration_in_seconds
:
Optional
[
float
]
=
field
(
default
=
0.0
,
metadata
=
{
"help"
:
"Filter audio files that are shorter than `min_duration_in_seconds` seconds"
}
)
only_data_
preprocessing
:
Optional
[
bool
]
=
field
(
preprocessing
_only
:
Optional
[
bool
]
=
field
(
default
=
False
,
metadata
=
{
"help"
:
"Whether to only do data preprocessing and skip training. "
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
"In this case, one should run the preprocessing in a non-distributed setup with `
only_data_
preprocessing=True` "
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing
_only
=True` "
"so that the cached datasets can consequently be loaded in distributed training"
},
)
...
...
@@ -450,41 +449,30 @@ def main():
if
model_args
.
freeze_feature_extractor
:
model
.
freeze_feature_extractor
()
# 5. Now we preprocess the datasets which includes loading the audio, resampling and padding
# 5. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so that we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor`
# The following code should be cleaned up as soon as
# https://github.com/huggingface/datasets/pull/2324 is merged
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
# make sure that dataset decodes audio with correct sampling rate
raw_datasets
=
raw_datasets
.
cast_column
(
"audio"
,
datasets
.
features
.
Audio
(
sampling_rate
=
feature_extractor
.
sampling_rate
)
)
# derive max & min input length for sample rate & max duration
max_input_length
=
data_args
.
max_duration_in_seconds
*
processor
.
feature_extractor
.
sampling_rate
min_input_length
=
data_args
.
min_duration_in_seconds
*
processor
.
feature_extractor
.
sampling_rate
resampler
=
None
if
raw_datasets
[
"train"
][
data_args
.
audio_column_name
][
0
].
split
(
"."
)[
-
1
]
==
"mp3"
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
resampler
=
torchaudio
.
transforms
.
Resample
(
48_000
,
processor
.
feature_extractor
.
sampling_rate
)
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
def
prepare_dataset
(
batch
):
# load audio
speech_array
,
sampling_rate
=
torchaudio
.
load
(
batch
[
data_args
.
audio_column_name
])
speech_array
=
speech_array
.
squeeze
()
# if necessary resample audio
if
resampler
is
not
None
:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
speech_array
=
resampler
(
speech_array
)
sampling_rate
=
resampler
.
new_freq
speech_array
=
speech_array
.
numpy
()
sample
=
batch
[
data_args
.
audio_column_name
]
batch
[
"input_values"
]
=
processor
(
s
peech_
array
,
sampling_rate
=
sampling_rate
,
truncate
=
True
,
max_length
=
max_input_length
s
ample
[
"
array
"
]
,
sampling_rate
=
sample
[
"
sampling_rate
"
]
,
truncate
=
True
,
max_length
=
max_input_length
).
input_values
[
0
]
batch
[
"input_length"
]
=
len
(
batch
[
"input_values"
])
# Setup the processor for targets
with
processor
.
as_target_processor
():
...
...
@@ -502,10 +490,13 @@ def main():
if
min_input_length
>
0.0
:
# filter data that is shorter than min_input_length
vectorized_datasets
=
vectorized_datasets
.
filter
(
lambda
data
:
len
(
data
[
"input_values"
])
>
min_input_length
,
lambda
x
:
x
>
min_input_length
,
num_proc
=
data_args
.
preprocessing_num_workers
,
input_columns
=
[
"input_length"
],
)
vectorized_datasets
=
vectorized_datasets
.
remove_columns
(
"input_length"
)
# 6. Next, we can prepare the training.
# Let's use word error rate (WER) as our evaluation metric,
# instantiate a data collator and the trainer
...
...
@@ -513,8 +504,13 @@ def main():
# Define Metric during training
wer_metric
=
load_metric
(
"wer"
)
if
data_args
.
only_data_preprocessing
:
logger
.
info
(
"Data preprocessing finished."
)
# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will most likely
# be a timeout when running the script in distributed mode.
# In a second step ``args.preprocessing_only`` can then be set to `False` to load the
# cached dataset
if
data_args
.
preprocessing_only
:
logger
.
info
(
f
"Data preprocessing finished. Files cached at
{
vectorized_datasets
.
cache_files
}
"
)
return
def
compute_metrics
(
pred
):
...
...
examples/pytorch/test_examples.py
View file @
37c5759c
...
...
@@ -395,7 +395,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--audio_column_name file
--do_train
--do_eval
--learning_rate 1e-4
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment